From d844057a09f6ca8cf0e6f3f935d9fd5dc151d3a1 Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Thu, 29 Jan 2026 20:35:35 +0000 Subject: [PATCH 1/4] Claude's first POC --- BedrockServer.cpp | 17 +- libstuff/SSignal.cpp | 167 ++++++++++++++++++ libstuff/SThread.h | 17 +- libstuff/libstuff.h | 42 +++++ test/clustertest/testplugin/TestPlugin.cpp | 49 +++++ .../clustertest/tests/SignalExceptionTest.cpp | 61 +++++++ 6 files changed, 349 insertions(+), 4 deletions(-) create mode 100644 test/clustertest/tests/SignalExceptionTest.cpp diff --git a/BedrockServer.cpp b/BedrockServer.cpp index b56cc80bd..888a7817c 100644 --- a/BedrockServer.cpp +++ b/BedrockServer.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -2207,12 +2208,14 @@ void BedrockServer::_acceptSockets() } // And start up this socket's thread. + // We use SThread to enable signal recovery - if a SIGSEGV occurs in the handler, + // it will be converted to an exception and the thread will exit gracefully. _outstandingSocketThreads++; - thread t; + pair> threadPair; bool threadStarted = false; while (!threadStarted) { try { - t = thread(&BedrockServer::handleSocket, this, move(socket), port == _controlPort, port == _commandPortPublic, port == _commandPortPrivate); + threadPair = SThread(&BedrockServer::handleSocket, this, move(socket), port == _controlPort, port == _commandPortPublic, port == _commandPortPrivate); threadStarted = true; } catch (const system_error& e) { // We don't care about this lock here from a performance perspective, it only happens when we @@ -2246,7 +2249,11 @@ void BedrockServer::_acceptSockets() } } try { - t.detach(); + // Detach the thread - it will run independently. + // The future (threadPair.second) is discarded. If handleSocket catches the + // SSignalException (it does), the future sees normal completion. If it doesn't + // catch, the exception is stored in the future but never retrieved. + threadPair.first.detach(); } catch (const system_error& e) { SALERT("Caught system_error in thread detach: " << e.code() << ", message: " << e.what()); throw; @@ -2513,6 +2520,10 @@ void BedrockServer::handleSocket(Socket&& socket, bool fromControlPort, bool fro SWARN("Socket in unhandled state: " << socket.state); } } + } catch (const SSignalException& e) { + // Signal-generated exception (SIGSEGV, SIGFPE, etc.) - log with full stack trace. + SALERT("handleSocket caught signal " << e.signalName() << " at " << e.faultAddress()); + e.logStackTrace(); } catch (const exception& e) { SALERT("handleSocket got exception: " << e.what()); } catch (...) { diff --git a/libstuff/SSignal.cpp b/libstuff/SSignal.cpp index 763ab0f69..87907d526 100644 --- a/libstuff/SSignal.cpp +++ b/libstuff/SSignal.cpp @@ -9,6 +9,11 @@ #include #include +// ucontext.h is needed for signal recovery on Linux +#if defined(__linux__) +#include +#endif + thread_local function SSignalHandlerDieFunc; void SSetSignalHandlerDieFunc(function&& func) { @@ -45,6 +50,24 @@ thread _SSignal_signalThread; // `abort()`, this records the original signal number until the signal handler for abort has a chance to log it. thread_local int _SSignal_threadCaughtSignalNumber = 0; +// Thread-local flag indicating this thread should convert signals to exceptions instead of aborting. +// Used by SThread to enable graceful recovery from SIGSEGV/SIGFPE. +thread_local bool _SSignal_threadIsRecoverable = false; + +// Thread-local flag to prevent recursive recovery attempts (e.g., if trampoline itself crashes). +thread_local bool _SSignal_inTrampoline = false; + +// Thread-local storage for crash info to be passed to the trampoline. +struct SSignalCrashInfo { + int signum; + void* faultAddress; + void* instructionPointer; + void* callstack[32]; + int callstackDepth; + bool hasCrashInfo; +}; +thread_local SSignalCrashInfo _SSignal_crashInfo = {0, nullptr, nullptr, {}, 0, false}; + // The number of termination signals received so far. atomic _SSignal_terminationCount(0); uint64_t STerminationSignalCount() @@ -89,6 +112,16 @@ void SClearSignals() _SSignal_pendingSignalBitMask.store(0); } +void SSetThreadRecoverable(bool recoverable) +{ + _SSignal_threadIsRecoverable = recoverable; +} + +bool SIsThreadRecoverable() +{ + return _SSignal_threadIsRecoverable; +} + void SInitializeSignals() { // Our default die function does nothing. @@ -204,8 +237,87 @@ void SStopSignalThread() } } +// Platform-specific helpers for ucontext manipulation. +// These allow us to redirect execution after a signal to a trampoline function. +#if defined(__linux__) +static void _SSignal_SetInstructionPointer(ucontext_t* ctx, void* target) { + #if defined(__x86_64__) + ctx->uc_mcontext.gregs[REG_RIP] = (greg_t)target; + #elif defined(__aarch64__) + ctx->uc_mcontext.pc = (unsigned long)target; + #else + #error "Unsupported architecture for signal recovery" + #endif +} + +static void* _SSignal_GetInstructionPointer(ucontext_t* ctx) { + #if defined(__x86_64__) + return (void*)ctx->uc_mcontext.gregs[REG_RIP]; + #elif defined(__aarch64__) + return (void*)ctx->uc_mcontext.pc; + #else + return nullptr; + #endif +} +#endif + +// Trampoline function that runs after the signal handler returns. +// This executes on the original thread's stack and throws an SSignalException. +// The signal handler redirects execution here by modifying the instruction pointer in ucontext. +__attribute__((noinline, noreturn)) +void _SSignal_Trampoline() { + // Mark that we're in the trampoline to prevent recursive recovery attempts. + _SSignal_inTrampoline = true; + + // Copy crash info from TLS to stack before throwing. + SSignalCrashInfo info = _SSignal_crashInfo; + _SSignal_crashInfo.hasCrashInfo = false; + + // Throw the exception - this will unwind to the catch block in SThread. + throw SSignalException( + info.signum, + info.faultAddress, + info.instructionPointer, + info.callstack, + info.callstackDepth + ); +} + void _SSignal_StackTrace(int signum, siginfo_t* info, void* ucontext) { +#if defined(__linux__) && (defined(__x86_64__) || defined(__aarch64__)) + // Check if this is a recoverable signal in a recoverable thread. + // SIGABRT is not recoverable - it's usually called intentionally or as a result of another crash. + bool isRecoverableSignal = (signum == SIGSEGV || signum == SIGFPE || signum == SIGBUS || signum == SIGILL); + + if (isRecoverableSignal && _SSignal_threadIsRecoverable && !_SSignal_inTrampoline && ucontext != nullptr) { + ucontext_t* ctx = static_cast(ucontext); + + // Store crash information in thread-local storage for the trampoline. + _SSignal_crashInfo.signum = signum; + _SSignal_crashInfo.faultAddress = info ? info->si_addr : nullptr; + _SSignal_crashInfo.instructionPointer = _SSignal_GetInstructionPointer(ctx); + + // Capture backtrace while we have the original context. + // Note: backtrace() is not strictly signal-safe but usually works. + _SSignal_crashInfo.callstackDepth = backtrace( + _SSignal_crashInfo.callstack, + sizeof(_SSignal_crashInfo.callstack) / sizeof(void*) + ); + _SSignal_crashInfo.hasCrashInfo = true; + + SWARN("Signal " << strsignal(signum) << "(" << signum << ") in recoverable thread, " + << "redirecting to exception trampoline. Fault address: " << (info ? info->si_addr : nullptr)); + + // Modify the instruction pointer to point to our trampoline. + // When the signal handler returns, execution will resume at the trampoline. + _SSignal_SetInstructionPointer(ctx, (void*)&_SSignal_Trampoline); + + // Return from signal handler - execution will resume at trampoline on original thread stack. + return; + } +#endif + if (signum == SIGSEGV || signum == SIGABRT || signum == SIGFPE || signum == SIGILL || signum == SIGBUS) { // If we haven't already saved a signal number, we'll do it now. Any signal we catch here will generate a // second ABORT signal, and we don't want that to overwrite this value, so we only set it if unset. @@ -306,3 +418,58 @@ void _SSignal_StackTrace(int signum, siginfo_t* info, void* ucontext) SALERT("Non-signal thread got signal " << strsignal(signum) << "(" << signum << "), which wasn't expected"); } } + +// SSignalException implementation +SSignalException::SSignalException(int signum, + void* faultAddress, + void* instructionPointer, + void* const* callstack, + int depth) + : _signum(signum), + _faultAddress(faultAddress), + _instructionPointer(instructionPointer), + _depth(min(depth, CALLSTACK_LIMIT)) +{ + if (callstack && _depth > 0) { + memcpy(_callstack, callstack, _depth * sizeof(void*)); + } +} + +const char* SSignalException::what() const noexcept { + if (_whatMessage.empty()) { + // Build message lazily to avoid issues in constructor context. + try { + _whatMessage = "Signal " + string(signalName()) + " at address " + SToHex((uint64_t)_faultAddress); + } catch (...) { + _whatMessage = "Signal exception"; + } + } + return _whatMessage.c_str(); +} + +const char* SSignalException::signalName() const noexcept { + switch (_signum) { + case SIGSEGV: return "SIGSEGV"; + case SIGFPE: return "SIGFPE"; + case SIGBUS: return "SIGBUS"; + case SIGILL: return "SIGILL"; + default: return "UNKNOWN"; + } +} + +vector SSignalException::stackTrace() const noexcept { + return SGetCallstack(_depth, _callstack); +} + +void SSignalException::logStackTrace() const noexcept { + try { + SWARN("Signal " << signalName() << " at fault address " + << SToHex((uint64_t)_faultAddress) + << ", instruction " << SToHex((uint64_t)_instructionPointer)); + for (const auto& frame : stackTrace()) { + SWARN(" " << frame); + } + } catch (...) { + // Logging failed, but we're noexcept so just ignore. + } +} diff --git a/libstuff/SThread.h b/libstuff/SThread.h index 83fde8b69..edf1521b7 100644 --- a/libstuff/SThread.h +++ b/libstuff/SThread.h @@ -11,7 +11,8 @@ using namespace std; // SThread is a thread wrapper intended to be used in the same way as thread, -// except that it will trap exceptions and pass them back to the caller as part of a promise. +// except that it will trap exceptions (including signal-generated exceptions like SIGSEGV) +// and pass them back to the caller as part of a promise. template auto SThread(F&& f, Args&&... args) { @@ -39,6 +40,12 @@ auto SThread(F&& f, Args&&... args) // Finally we can create our new thread and pass it our function and arguments. thread t( [p = move(prom), fn = move(fn), argTuple = move(argTuple)]() mutable { + // Initialize signal handling for this thread and mark it as recoverable. + // This allows signals like SIGSEGV/SIGFPE to be converted to SSignalException + // instead of aborting the process. + SInitializeSignals(); + SSetThreadRecoverable(true); + try { // We call `apply` to use our argments from a tuple as if they were a list of discrete arguments. // This is effectively like calling `invoke` and passing the arguments separately. @@ -50,6 +57,11 @@ auto SThread(F&& f, Args&&... args) } else { p.set_value(apply(move(fn), move(argTuple))); } + } catch (const SSignalException& e) { + // Signal-generated exception - log stack trace before propagating. + SWARN("Signal exception in SThread: " << e.what()); + e.logStackTrace(); + p.set_exception(current_exception()); } catch (const exception& e) { SWARN("Uncaught exception in SThread: " << e.what()); p.set_exception(current_exception()); @@ -57,6 +69,9 @@ auto SThread(F&& f, Args&&... args) SWARN("Uncaught exception in SThread: unknown type"); p.set_exception(current_exception()); } + + // Restore non-recoverable state (belt-and-suspenders, thread is ending anyway). + SSetThreadRecoverable(false); } ); diff --git a/libstuff/libstuff.h b/libstuff/libstuff.h index dc39b00e1..72768473d 100644 --- a/libstuff/libstuff.h +++ b/libstuff/libstuff.h @@ -169,6 +169,41 @@ class SException : public exception { // Utility function for generating pretty callstacks. vector SGetCallstack(int depth = 0, void* const* callstack = nullptr) noexcept; +// An SSignalException is thrown when a recoverable thread (started via SThread) receives +// a signal like SIGSEGV or SIGFPE. Instead of crashing the process, the signal is converted +// into this exception which can be caught and handled gracefully. +class SSignalException : public exception { +private: + static const int CALLSTACK_LIMIT = 32; + int _signum; + void* _faultAddress; + void* _instructionPointer; + void* _callstack[CALLSTACK_LIMIT]; + int _depth; + mutable string _whatMessage; + +public: + SSignalException(int signum, + void* faultAddress, + void* instructionPointer, + void* const* callstack, + int depth); + + const char* what() const noexcept override; + + // Accessors + int signal() const noexcept { return _signum; } + void* faultAddress() const noexcept { return _faultAddress; } + void* instructionPointer() const noexcept { return _instructionPointer; } + + // Get demangled stack trace + vector stackTrace() const noexcept; + void logStackTrace() const noexcept; + + // Signal name for logging + const char* signalName() const noexcept; +}; + // -------------------------------------------------------------------------- // Time stuff TODO: Replace with chrono // -------------------------------------------------------------------------- @@ -235,6 +270,13 @@ void SClearSignals(); void SStopSignalThread(); +// Mark this thread as recoverable - signals like SIGSEGV/SIGFPE will be converted +// to SSignalException instead of aborting. Used by SThread. +void SSetThreadRecoverable(bool recoverable); + +// Check if this thread is marked as recoverable. +bool SIsThreadRecoverable(); + // And also exception stuff. string SGetCurrentExceptionName(); void STerminateHandler(void); diff --git a/test/clustertest/testplugin/TestPlugin.cpp b/test/clustertest/testplugin/TestPlugin.cpp index ccba49ae4..ab82f964f 100644 --- a/test/clustertest/testplugin/TestPlugin.cpp +++ b/test/clustertest/testplugin/TestPlugin.cpp @@ -111,6 +111,8 @@ unique_ptr BedrockPlugin_TestPlugin::getCommand(SQLiteCommand&& "testPostProcessTimeout", "EscalateSerializedData", "ThreadException", + "ThreadSIGSEGV", + "ThreadSIGFPE", "httpswait" }; for (auto& cmdName : supportedCommands) { @@ -248,6 +250,53 @@ bool TestPluginCommand::peek(SQLite& db) threadpair.second.get(); } + return true; + } else if (SStartsWith(request.methodLine, "ThreadSIGSEGV")) { + // Test SIGSEGV recovery in SThread. + auto threadpair = SThread([](){ + // Deliberately cause SIGSEGV by dereferencing null pointer. + int* ptr = nullptr; + *ptr = 42; + }); + + // Wait for the thread to finish. + threadpair.first.join(); + + // If rethrow is set, try to get the future result which should throw SSignalException. + if (request.isSet("rethrow")) { + try { + threadpair.second.get(); + } catch (const SSignalException& e) { + response.methodLine = "200 Caught SIGSEGV"; + return true; + } + } + + response.methodLine = "200 OK"; + return true; + } else if (SStartsWith(request.methodLine, "ThreadSIGFPE")) { + // Test SIGFPE recovery in SThread. + auto threadpair = SThread([](){ + // Deliberately cause SIGFPE by dividing by zero. + volatile int zero = 0; + volatile int result = 1 / zero; + (void)result; + }); + + // Wait for the thread to finish. + threadpair.first.join(); + + // If rethrow is set, try to get the future result which should throw SSignalException. + if (request.isSet("rethrow")) { + try { + threadpair.second.get(); + } catch (const SSignalException& e) { + response.methodLine = "200 Caught SIGFPE"; + return true; + } + } + + response.methodLine = "200 OK"; return true; } else if (SStartsWith(request.methodLine, "EscalateSerializedData")) { // Only set this if it's blank. The intention is that it will be blank on a follower, but already set by diff --git a/test/clustertest/tests/SignalExceptionTest.cpp b/test/clustertest/tests/SignalExceptionTest.cpp new file mode 100644 index 000000000..d85a43685 --- /dev/null +++ b/test/clustertest/tests/SignalExceptionTest.cpp @@ -0,0 +1,61 @@ +#include +#include + +struct SignalExceptionTest : tpunit::TestFixture +{ + SignalExceptionTest() + : tpunit::TestFixture("SignalException", + TEST(SignalExceptionTest::testThreadSIGSEGV), + TEST(SignalExceptionTest::testThreadSIGFPE), + TEST(SignalExceptionTest::testSocketThreadSIGSEGV)) + { + } + + // Test SIGSEGV in an SThread is caught and propagated + void testThreadSIGSEGV() + { + BedrockClusterTester tester; + + // Without rethrow - thread catches exception internally + SData command("ThreadSIGSEGV"); + SData result = tester.getTester(0).executeWaitMultipleData({command})[0]; + ASSERT_EQUAL(result.methodLine, "200 OK"); + + // With rethrow - verify exception is caught as SSignalException + command["rethrow"] = "true"; + result = tester.getTester(0).executeWaitMultipleData({command})[0]; + ASSERT_EQUAL(result.methodLine, "200 Caught SIGSEGV"); + } + + // Test SIGFPE in an SThread is caught and propagated + void testThreadSIGFPE() + { + BedrockClusterTester tester; + + SData command("ThreadSIGFPE"); + command["rethrow"] = "true"; + SData result = tester.getTester(0).executeWaitMultipleData({command})[0]; + ASSERT_EQUAL(result.methodLine, "200 Caught SIGFPE"); + } + + // Test SIGSEGV in socket thread (via generatesegfaultpeek) now recovers + // instead of crashing. Server should remain running. + void testSocketThreadSIGSEGV() + { + BedrockClusterTester tester; + + // This command triggers SIGSEGV in the socket handler thread. + // With signal recovery, the thread should exit gracefully and + // the server should continue running. + SData command("generatesegfaultpeek"); + SData result = tester.getTester(0).executeWaitMultipleData({command})[0]; + + // The socket will close due to the exception, but server stays up. + // We might get a connection error or empty response. + // The key test is that the server is still alive: + + SData pingCommand("Status"); + SData pingResult = tester.getTester(0).executeWaitMultipleData({pingCommand})[0]; + ASSERT_EQUAL(pingResult.methodLine, "200 OK"); + } +} __SignalExceptionTest; From c801111be3334df49ba330d63f3e282f4ffacae7 Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Thu, 29 Jan 2026 14:19:18 -0800 Subject: [PATCH 2/4] Replace deprecated ucontext with sigsetjmp/siglongjmp for signal recovery Switch from the deprecated ucontext_t API to standard POSIX sigsetjmp/siglongjmp for converting SIGSEGV/SIGFPE signals to exceptions in SThread. Changes: - Remove ucontext.h include and platform-specific ucontext helpers - Remove trampoline function (no longer needed) - Add sigjmp_buf thread-local storage for recovery point - Signal handler now calls siglongjmp() instead of modifying ucontext - SThread uses sigsetjmp() to establish recovery point - Add SGetRecoveryPoint(), SSetRecoveryPointActive(), SBuildSignalException() APIs Benefits: - Uses standard POSIX API instead of deprecated ucontext - Simpler implementation (no trampoline needed) - Works on any POSIX system, not just Linux x86_64/ARM64 Tradeoff: - Destructors between fault and recovery point won't run (acceptable since thread is exiting anyway) Co-Authored-By: Claude Opus 4.5 --- libstuff/SSignal.cpp | 115 +++++++++++++++---------------------------- libstuff/SThread.h | 22 +++++++++ libstuff/libstuff.h | 11 +++++ 3 files changed, 74 insertions(+), 74 deletions(-) diff --git a/libstuff/SSignal.cpp b/libstuff/SSignal.cpp index 87907d526..54a06a06a 100644 --- a/libstuff/SSignal.cpp +++ b/libstuff/SSignal.cpp @@ -9,10 +9,7 @@ #include #include -// ucontext.h is needed for signal recovery on Linux -#if defined(__linux__) -#include -#endif +// setjmp.h is included via libstuff.h for signal recovery thread_local function SSignalHandlerDieFunc; void SSetSignalHandlerDieFunc(function&& func) @@ -54,19 +51,21 @@ thread_local int _SSignal_threadCaughtSignalNumber = 0; // Used by SThread to enable graceful recovery from SIGSEGV/SIGFPE. thread_local bool _SSignal_threadIsRecoverable = false; -// Thread-local flag to prevent recursive recovery attempts (e.g., if trampoline itself crashes). -thread_local bool _SSignal_inTrampoline = false; +// Thread-local jump buffer for signal recovery via sigsetjmp/siglongjmp. +thread_local sigjmp_buf _SSignal_recoveryPoint; -// Thread-local storage for crash info to be passed to the trampoline. +// Thread-local flag indicating whether the recovery point has been set (sigsetjmp called). +thread_local bool _SSignal_recoveryPointSet = false; + +// Thread-local storage for crash info to be passed back after siglongjmp. struct SSignalCrashInfo { int signum; void* faultAddress; - void* instructionPointer; void* callstack[32]; int callstackDepth; bool hasCrashInfo; }; -thread_local SSignalCrashInfo _SSignal_crashInfo = {0, nullptr, nullptr, {}, 0, false}; +thread_local SSignalCrashInfo _SSignal_crashInfo = {0, nullptr, {}, 0, false}; // The number of termination signals received so far. atomic _SSignal_terminationCount(0); @@ -122,6 +121,29 @@ bool SIsThreadRecoverable() return _SSignal_threadIsRecoverable; } +sigjmp_buf* SGetRecoveryPoint() +{ + return &_SSignal_recoveryPoint; +} + +void SSetRecoveryPointActive(bool active) +{ + _SSignal_recoveryPointSet = active; +} + +SSignalException SBuildSignalException() +{ + SSignalException ex( + _SSignal_crashInfo.signum, + _SSignal_crashInfo.faultAddress, + nullptr, // No instruction pointer available with sigsetjmp approach + _SSignal_crashInfo.callstack, + _SSignal_crashInfo.callstackDepth + ); + _SSignal_crashInfo.hasCrashInfo = false; + return ex; +} + void SInitializeSignals() { // Our default die function does nothing. @@ -237,68 +259,18 @@ void SStopSignalThread() } } -// Platform-specific helpers for ucontext manipulation. -// These allow us to redirect execution after a signal to a trampoline function. -#if defined(__linux__) -static void _SSignal_SetInstructionPointer(ucontext_t* ctx, void* target) { - #if defined(__x86_64__) - ctx->uc_mcontext.gregs[REG_RIP] = (greg_t)target; - #elif defined(__aarch64__) - ctx->uc_mcontext.pc = (unsigned long)target; - #else - #error "Unsupported architecture for signal recovery" - #endif -} - -static void* _SSignal_GetInstructionPointer(ucontext_t* ctx) { - #if defined(__x86_64__) - return (void*)ctx->uc_mcontext.gregs[REG_RIP]; - #elif defined(__aarch64__) - return (void*)ctx->uc_mcontext.pc; - #else - return nullptr; - #endif -} -#endif - -// Trampoline function that runs after the signal handler returns. -// This executes on the original thread's stack and throws an SSignalException. -// The signal handler redirects execution here by modifying the instruction pointer in ucontext. -__attribute__((noinline, noreturn)) -void _SSignal_Trampoline() { - // Mark that we're in the trampoline to prevent recursive recovery attempts. - _SSignal_inTrampoline = true; - - // Copy crash info from TLS to stack before throwing. - SSignalCrashInfo info = _SSignal_crashInfo; - _SSignal_crashInfo.hasCrashInfo = false; - - // Throw the exception - this will unwind to the catch block in SThread. - throw SSignalException( - info.signum, - info.faultAddress, - info.instructionPointer, - info.callstack, - info.callstackDepth - ); -} - void _SSignal_StackTrace(int signum, siginfo_t* info, void* ucontext) { -#if defined(__linux__) && (defined(__x86_64__) || defined(__aarch64__)) - // Check if this is a recoverable signal in a recoverable thread. + // Check if this is a recoverable signal in a recoverable thread with a recovery point set. // SIGABRT is not recoverable - it's usually called intentionally or as a result of another crash. bool isRecoverableSignal = (signum == SIGSEGV || signum == SIGFPE || signum == SIGBUS || signum == SIGILL); - if (isRecoverableSignal && _SSignal_threadIsRecoverable && !_SSignal_inTrampoline && ucontext != nullptr) { - ucontext_t* ctx = static_cast(ucontext); - - // Store crash information in thread-local storage for the trampoline. + if (isRecoverableSignal && _SSignal_threadIsRecoverable && _SSignal_recoveryPointSet) { + // Store crash information in thread-local storage before jumping. _SSignal_crashInfo.signum = signum; _SSignal_crashInfo.faultAddress = info ? info->si_addr : nullptr; - _SSignal_crashInfo.instructionPointer = _SSignal_GetInstructionPointer(ctx); - // Capture backtrace while we have the original context. + // Capture backtrace while we're in the signal handler context. // Note: backtrace() is not strictly signal-safe but usually works. _SSignal_crashInfo.callstackDepth = backtrace( _SSignal_crashInfo.callstack, @@ -307,16 +279,13 @@ void _SSignal_StackTrace(int signum, siginfo_t* info, void* ucontext) _SSignal_crashInfo.hasCrashInfo = true; SWARN("Signal " << strsignal(signum) << "(" << signum << ") in recoverable thread, " - << "redirecting to exception trampoline. Fault address: " << (info ? info->si_addr : nullptr)); - - // Modify the instruction pointer to point to our trampoline. - // When the signal handler returns, execution will resume at the trampoline. - _SSignal_SetInstructionPointer(ctx, (void*)&_SSignal_Trampoline); + << "jumping to recovery point. Fault address: " << (info ? info->si_addr : nullptr)); - // Return from signal handler - execution will resume at trampoline on original thread stack. - return; + // Jump back to the recovery point set in SThread. + // The second argument becomes the return value of sigsetjmp. + siglongjmp(_SSignal_recoveryPoint, signum); + // Never reaches here } -#endif if (signum == SIGSEGV || signum == SIGABRT || signum == SIGFPE || signum == SIGILL || signum == SIGBUS) { // If we haven't already saved a signal number, we'll do it now. Any signal we catch here will generate a @@ -463,9 +432,7 @@ vector SSignalException::stackTrace() const noexcept { void SSignalException::logStackTrace() const noexcept { try { - SWARN("Signal " << signalName() << " at fault address " - << SToHex((uint64_t)_faultAddress) - << ", instruction " << SToHex((uint64_t)_instructionPointer)); + SWARN("Signal " << signalName() << " at fault address " << SToHex((uint64_t)_faultAddress)); for (const auto& frame : stackTrace()) { SWARN(" " << frame); } diff --git a/libstuff/SThread.h b/libstuff/SThread.h index edf1521b7..4403e81aa 100644 --- a/libstuff/SThread.h +++ b/libstuff/SThread.h @@ -46,6 +46,27 @@ auto SThread(F&& f, Args&&... args) SInitializeSignals(); SSetThreadRecoverable(true); + // Set up the recovery point for signal handling using sigsetjmp. + // If a signal occurs, the handler will call siglongjmp and sigsetjmp will + // "return" with the signal number instead of 0. + int signalCaught = sigsetjmp(*SGetRecoveryPoint(), 1); + SSetRecoveryPointActive(true); + + if (signalCaught != 0) { + // We got here via siglongjmp from the signal handler. + // signalCaught contains the signal number. + SSetRecoveryPointActive(false); + SSetThreadRecoverable(false); + + // Build the exception from the crash info stored by the signal handler. + SSignalException ex = SBuildSignalException(); + + SWARN("Signal exception in SThread: " << ex.what()); + ex.logStackTrace(); + p.set_exception(make_exception_ptr(ex)); + return; + } + try { // We call `apply` to use our argments from a tuple as if they were a list of discrete arguments. // This is effectively like calling `invoke` and passing the arguments separately. @@ -71,6 +92,7 @@ auto SThread(F&& f, Args&&... args) } // Restore non-recoverable state (belt-and-suspenders, thread is ending anyway). + SSetRecoveryPointActive(false); SSetThreadRecoverable(false); } ); diff --git a/libstuff/libstuff.h b/libstuff/libstuff.h index 72768473d..91a2b90bb 100644 --- a/libstuff/libstuff.h +++ b/libstuff/libstuff.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -277,6 +278,16 @@ void SSetThreadRecoverable(bool recoverable); // Check if this thread is marked as recoverable. bool SIsThreadRecoverable(); +// Get the thread-local recovery point for sigsetjmp/siglongjmp signal recovery. +sigjmp_buf* SGetRecoveryPoint(); + +// Set whether the recovery point is active (sigsetjmp has been called). +void SSetRecoveryPointActive(bool active); + +// Build an SSignalException from the current thread-local crash info. +// Called after siglongjmp returns to the recovery point. +SSignalException SBuildSignalException(); + // And also exception stuff. string SGetCurrentExceptionName(); void STerminateHandler(void); From b1fc4e63771ca157266931f850a46d84831e14dd Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Fri, 30 Jan 2026 00:03:25 +0000 Subject: [PATCH 3/4] Claude is probably not going to get this working, but it's a starting point --- BedrockServer.cpp | 20 +++++++-- BedrockServer.h | 4 ++ libstuff/SSignal.cpp | 20 ++++++--- libstuff/SThread.h | 48 +++++++++++++++++++--- test/clustertest/testplugin/TestPlugin.cpp | 7 ++-- 5 files changed, 79 insertions(+), 20 deletions(-) diff --git a/BedrockServer.cpp b/BedrockServer.cpp index 888a7817c..8a6f1760b 100644 --- a/BedrockServer.cpp +++ b/BedrockServer.cpp @@ -2208,14 +2208,18 @@ void BedrockServer::_acceptSockets() } // And start up this socket's thread. - // We use SThread to enable signal recovery - if a SIGSEGV occurs in the handler, + // We use SThreadWithCleanup to enable signal recovery - if a SIGSEGV occurs in the handler, // it will be converted to an exception and the thread will exit gracefully. + // The cleanup callback ensures _outstandingSocketThreads is decremented even when + // siglongjmp skips the normal cleanup code in handleSocket. _outstandingSocketThreads++; pair> threadPair; bool threadStarted = false; while (!threadStarted) { try { - threadPair = SThread(&BedrockServer::handleSocket, this, move(socket), port == _controlPort, port == _commandPortPublic, port == _commandPortPrivate); + threadPair = SThreadWithCleanup( + [this]() { _socketThreadCleanup(); }, + &BedrockServer::handleSocket, this, move(socket), port == _controlPort, port == _commandPortPublic, port == _commandPortPrivate); threadStarted = true; } catch (const system_error& e) { // We don't care about this lock here from a performance perspective, it only happens when we @@ -2530,8 +2534,16 @@ void BedrockServer::handleSocket(Socket&& socket, bool fromControlPort, bool fro SALERT("handleSocket got unknown exception"); } - // At this point out socket is closed and we can clean up. - // Note that we never return early, we always want to hit this code and decrement our counter and clean up our socket. + // Cleanup is handled by SThreadWithCleanup callback - see _socketThreadCleanup(). + // This ensures cleanup happens even if a signal (SIGSEGV, etc.) causes siglongjmp + // to skip the normal function exit. +} + +void BedrockServer::_socketThreadCleanup() +{ + // Decrement our counter and check if we need to unblock new socket threads. + // This is called from SThreadWithCleanup to ensure it runs even if a signal + // causes siglongjmp to skip the normal handleSocket exit. _outstandingSocketThreads--; SINFO("[performance] Socket thread complete (" << _outstandingSocketThreads << " remaining)."); diff --git a/BedrockServer.h b/BedrockServer.h index 75773bbd7..afb5c4e1c 100644 --- a/BedrockServer.h +++ b/BedrockServer.h @@ -346,6 +346,10 @@ class BedrockServer : public SQLiteServer { // those ports. void _acceptSockets(); + // Cleanup callback for socket threads. Called from SThreadWithCleanup to ensure + // cleanup happens even if a signal causes siglongjmp to skip normal function exit. + void _socketThreadCleanup(); + // This stars the server shutting down. void _beginShutdown(const string& reason, bool detach = false); diff --git a/libstuff/SSignal.cpp b/libstuff/SSignal.cpp index 54a06a06a..a4b9781ef 100644 --- a/libstuff/SSignal.cpp +++ b/libstuff/SSignal.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -173,11 +174,11 @@ void SInitializeSignals() // This is the signal action structure we'll use to specify what to listen for. struct sigaction newAction = {}; - // The old style handler is explicitly null - newAction.sa_handler = nullptr; - newAction.sa_flags = SA_ONSTACK; + // SA_SIGINFO is required to use sa_sigaction (three-argument handler) instead of sa_handler. + // SA_ONSTACK uses the alternate signal stack set up above. + newAction.sa_flags = SA_SIGINFO | SA_ONSTACK; - // The new style handler is _SSignal_StackTrace. + // The three-argument signal handler that receives siginfo_t with fault details. newAction.sa_sigaction = &_SSignal_StackTrace; // While we're inside the signal handler, we want to block any other signals from occurring until we return. @@ -265,6 +266,11 @@ void _SSignal_StackTrace(int signum, siginfo_t* info, void* ucontext) // SIGABRT is not recoverable - it's usually called intentionally or as a result of another crash. bool isRecoverableSignal = (signum == SIGSEGV || signum == SIGFPE || signum == SIGBUS || signum == SIGILL); + // Debug: log recovery state + SWARN("Signal handler: tid=" << syscall(SYS_gettid) << " signum=" << signum << " isRecoverable=" << isRecoverableSignal + << " threadRecoverable=" << _SSignal_threadIsRecoverable + << " recoveryPointSet=" << _SSignal_recoveryPointSet); + if (isRecoverableSignal && _SSignal_threadIsRecoverable && _SSignal_recoveryPointSet) { // Store crash information in thread-local storage before jumping. _SSignal_crashInfo.signum = signum; @@ -345,7 +351,8 @@ void _SSignal_StackTrace(int signum, siginfo_t* info, void* ucontext) SWARN(fullLogLine); if (fd != -1) { fullLogLine = format("{}{}", fullLogLine, "\n"); - write(fd, fullLogLine.c_str(), strlen(fullLogLine.c_str())); + // Ignore return value - we're in a signal handler and can't do much if write fails. + [[maybe_unused]] auto _ = write(fd, fullLogLine.c_str(), strlen(fullLogLine.c_str())); } free(frame); } @@ -366,7 +373,8 @@ void _SSignal_StackTrace(int signum, siginfo_t* info, void* ucontext) // Finish writing the crash file with the request details if it exists if (fd != -1 && !logMessage.empty()) { logMessage += "\n"; - write(fd, logMessage.c_str(), strlen(logMessage.c_str())); + // Ignore return value - we're in a signal handler and can't do much if write fails. + [[maybe_unused]] auto __ = write(fd, logMessage.c_str(), strlen(logMessage.c_str())); } close(fd); diff --git a/libstuff/SThread.h b/libstuff/SThread.h index 4403e81aa..9f94c8053 100644 --- a/libstuff/SThread.h +++ b/libstuff/SThread.h @@ -5,21 +5,28 @@ #include #include #include +#include +#include #include "libstuff.h" using namespace std; -// SThread is a thread wrapper intended to be used in the same way as thread, -// except that it will trap exceptions (including signal-generated exceptions like SIGSEGV) -// and pass them back to the caller as part of a promise. -template -auto SThread(F&& f, Args&&... args) +// Internal implementation of SThread with optional cleanup callback. +// The cleanup callback is called at thread exit regardless of how the thread terminates: +// - Normal completion +// - Exception thrown +// - Signal caught via siglongjmp (SIGSEGV, SIGFPE, etc.) +// This is critical because siglongjmp does not unwind the stack, so cleanup code in the +// user's function will not run in the signal case. +template +auto _SThreadImpl(Cleanup&& cleanup, F&& f, Args&&... args) { // Create type aliases for the function, argument list, and return types. // These are decayed as per decay (https://en.cppreference.com/w/cpp/types/decay.html) // which makes the same sort of type conversions that the compiler makes when passign by value. using Fn = decay_t; + using CleanupFn = decay_t; // We create a tuple from the passed args to allow passing variadic arguments to our lambda below. using DecayedArgsTuple = tuple ...>; @@ -35,22 +42,25 @@ auto SThread(F&& f, Args&&... args) // Now we can create the callable function and it's arguments that we will pass to our lambda. Fn fn(forward(f)); + CleanupFn cleanupFn(forward(cleanup)); DecayedArgsTuple argTuple(forward(args)...); // Finally we can create our new thread and pass it our function and arguments. thread t( - [p = move(prom), fn = move(fn), argTuple = move(argTuple)]() mutable { + [p = move(prom), fn = move(fn), cleanupFn = move(cleanupFn), argTuple = move(argTuple)]() mutable { // Initialize signal handling for this thread and mark it as recoverable. // This allows signals like SIGSEGV/SIGFPE to be converted to SSignalException // instead of aborting the process. SInitializeSignals(); SSetThreadRecoverable(true); + SINFO("SThread: tid=" << syscall(SYS_gettid) << " Set threadRecoverable=true, isRecoverable=" << SIsThreadRecoverable()); // Set up the recovery point for signal handling using sigsetjmp. // If a signal occurs, the handler will call siglongjmp and sigsetjmp will // "return" with the signal number instead of 0. int signalCaught = sigsetjmp(*SGetRecoveryPoint(), 1); SSetRecoveryPointActive(true); + SINFO("SThread: tid=" << syscall(SYS_gettid) << " Set recoveryPointActive=true after sigsetjmp"); if (signalCaught != 0) { // We got here via siglongjmp from the signal handler. @@ -64,6 +74,10 @@ auto SThread(F&& f, Args&&... args) SWARN("Signal exception in SThread: " << ex.what()); ex.logStackTrace(); p.set_exception(make_exception_ptr(ex)); + + // Call cleanup before returning - this is critical because siglongjmp + // skips any cleanup code in the user's function. + cleanupFn(); return; } @@ -94,6 +108,9 @@ auto SThread(F&& f, Args&&... args) // Restore non-recoverable state (belt-and-suspenders, thread is ending anyway). SSetRecoveryPointActive(false); SSetThreadRecoverable(false); + + // Call cleanup at thread exit. + cleanupFn(); } ); @@ -101,3 +118,22 @@ auto SThread(F&& f, Args&&... args) // for it to complete, and also the future, so that the caller can check if there were any exceptions. return make_pair(move(t), move(fut)); } + +// SThread is a thread wrapper intended to be used in the same way as thread, +// except that it will trap exceptions (including signal-generated exceptions like SIGSEGV) +// and pass them back to the caller as part of a promise. +template +auto SThread(F&& f, Args&&... args) +{ + return _SThreadImpl([](){}, forward(f), forward(args)...); +} + +// SThreadWithCleanup is like SThread but takes a cleanup callback that is guaranteed +// to be called when the thread exits, even if the thread is terminated by a signal +// (SIGSEGV, SIGFPE, etc.). This is necessary because siglongjmp does not unwind +// the stack, so any cleanup code in the user's function would be skipped. +template +auto SThreadWithCleanup(Cleanup&& cleanup, F&& f, Args&&... args) +{ + return _SThreadImpl(forward(cleanup), forward(f), forward(args)...); +} diff --git a/test/clustertest/testplugin/TestPlugin.cpp b/test/clustertest/testplugin/TestPlugin.cpp index ab82f964f..88fbede8b 100644 --- a/test/clustertest/testplugin/TestPlugin.cpp +++ b/test/clustertest/testplugin/TestPlugin.cpp @@ -1,5 +1,6 @@ #include "TestPlugin.h" +#include #include #include #include @@ -277,10 +278,8 @@ bool TestPluginCommand::peek(SQLite& db) } else if (SStartsWith(request.methodLine, "ThreadSIGFPE")) { // Test SIGFPE recovery in SThread. auto threadpair = SThread([](){ - // Deliberately cause SIGFPE by dividing by zero. - volatile int zero = 0; - volatile int result = 1 / zero; - (void)result; + // Raise SIGFPE explicitly - division by zero doesn't reliably raise SIGFPE on all systems. + raise(SIGFPE); }); // Wait for the thread to finish. From 4416802eaefafaa305ce481ecba74b185f473610 Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Fri, 30 Jan 2026 01:08:09 +0000 Subject: [PATCH 4/4] Style --- BedrockServer.cpp | 4 ++- libstuff/SSignal.cpp | 29 ++++++++++++------- libstuff/SThread.h | 3 +- libstuff/libstuff.h | 17 +++++++++-- .../clustertest/tests/SignalExceptionTest.cpp | 6 ++-- 5 files changed, 41 insertions(+), 18 deletions(-) diff --git a/BedrockServer.cpp b/BedrockServer.cpp index 8a6f1760b..930dc798e 100644 --- a/BedrockServer.cpp +++ b/BedrockServer.cpp @@ -2218,7 +2218,9 @@ void BedrockServer::_acceptSockets() while (!threadStarted) { try { threadPair = SThreadWithCleanup( - [this]() { _socketThreadCleanup(); }, + [this]() { + _socketThreadCleanup(); + }, &BedrockServer::handleSocket, this, move(socket), port == _controlPort, port == _commandPortPublic, port == _commandPortPrivate); threadStarted = true; } catch (const system_error& e) { diff --git a/libstuff/SSignal.cpp b/libstuff/SSignal.cpp index a4b9781ef..621853d07 100644 --- a/libstuff/SSignal.cpp +++ b/libstuff/SSignal.cpp @@ -59,7 +59,8 @@ thread_local sigjmp_buf _SSignal_recoveryPoint; thread_local bool _SSignal_recoveryPointSet = false; // Thread-local storage for crash info to be passed back after siglongjmp. -struct SSignalCrashInfo { +struct SSignalCrashInfo +{ int signum; void* faultAddress; void* callstack[32]; @@ -403,20 +404,21 @@ SSignalException::SSignalException(int signum, void* const* callstack, int depth) : _signum(signum), - _faultAddress(faultAddress), - _instructionPointer(instructionPointer), - _depth(min(depth, CALLSTACK_LIMIT)) + _faultAddress(faultAddress), + _instructionPointer(instructionPointer), + _depth(min(depth, CALLSTACK_LIMIT)) { if (callstack && _depth > 0) { memcpy(_callstack, callstack, _depth * sizeof(void*)); } } -const char* SSignalException::what() const noexcept { +const char* SSignalException::what() const noexcept +{ if (_whatMessage.empty()) { // Build message lazily to avoid issues in constructor context. try { - _whatMessage = "Signal " + string(signalName()) + " at address " + SToHex((uint64_t)_faultAddress); + _whatMessage = "Signal " + string(signalName()) + " at address " + SToHex((uint64_t) _faultAddress); } catch (...) { _whatMessage = "Signal exception"; } @@ -424,23 +426,30 @@ const char* SSignalException::what() const noexcept { return _whatMessage.c_str(); } -const char* SSignalException::signalName() const noexcept { +const char* SSignalException::signalName() const noexcept +{ switch (_signum) { case SIGSEGV: return "SIGSEGV"; + case SIGFPE: return "SIGFPE"; + case SIGBUS: return "SIGBUS"; + case SIGILL: return "SIGILL"; + default: return "UNKNOWN"; } } -vector SSignalException::stackTrace() const noexcept { +vector SSignalException::stackTrace() const noexcept +{ return SGetCallstack(_depth, _callstack); } -void SSignalException::logStackTrace() const noexcept { +void SSignalException::logStackTrace() const noexcept +{ try { - SWARN("Signal " << signalName() << " at fault address " << SToHex((uint64_t)_faultAddress)); + SWARN("Signal " << signalName() << " at fault address " << SToHex((uint64_t) _faultAddress)); for (const auto& frame : stackTrace()) { SWARN(" " << frame); } diff --git a/libstuff/SThread.h b/libstuff/SThread.h index 9f94c8053..6e9376183 100644 --- a/libstuff/SThread.h +++ b/libstuff/SThread.h @@ -125,7 +125,8 @@ auto _SThreadImpl(Cleanup&& cleanup, F&& f, Args&&... args) template auto SThread(F&& f, Args&&... args) { - return _SThreadImpl([](){}, forward(f), forward(args)...); + return _SThreadImpl([](){ + }, forward(f), forward(args)...); } // SThreadWithCleanup is like SThread but takes a cleanup callback that is guaranteed diff --git a/libstuff/libstuff.h b/libstuff/libstuff.h index 91a2b90bb..295d040bc 100644 --- a/libstuff/libstuff.h +++ b/libstuff/libstuff.h @@ -193,9 +193,20 @@ class SSignalException : public exception { const char* what() const noexcept override; // Accessors - int signal() const noexcept { return _signum; } - void* faultAddress() const noexcept { return _faultAddress; } - void* instructionPointer() const noexcept { return _instructionPointer; } + int signal() const noexcept + { + return _signum; + } + + void* faultAddress() const noexcept + { + return _faultAddress; + } + + void* instructionPointer() const noexcept + { + return _instructionPointer; + } // Get demangled stack trace vector stackTrace() const noexcept; diff --git a/test/clustertest/tests/SignalExceptionTest.cpp b/test/clustertest/tests/SignalExceptionTest.cpp index d85a43685..d2beab309 100644 --- a/test/clustertest/tests/SignalExceptionTest.cpp +++ b/test/clustertest/tests/SignalExceptionTest.cpp @@ -5,9 +5,9 @@ struct SignalExceptionTest : tpunit::TestFixture { SignalExceptionTest() : tpunit::TestFixture("SignalException", - TEST(SignalExceptionTest::testThreadSIGSEGV), - TEST(SignalExceptionTest::testThreadSIGFPE), - TEST(SignalExceptionTest::testSocketThreadSIGSEGV)) + TEST(SignalExceptionTest::testThreadSIGSEGV), + TEST(SignalExceptionTest::testThreadSIGFPE), + TEST(SignalExceptionTest::testSocketThreadSIGSEGV)) { }