diff --git a/BedrockServer.cpp b/BedrockServer.cpp index b56cc80bd..930dc798e 100644 --- a/BedrockServer.cpp +++ b/BedrockServer.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -2207,12 +2208,20 @@ void BedrockServer::_acceptSockets() } // And start up this socket's thread. + // We use SThreadWithCleanup to enable signal recovery - if a SIGSEGV occurs in the handler, + // it will be converted to an exception and the thread will exit gracefully. + // The cleanup callback ensures _outstandingSocketThreads is decremented even when + // siglongjmp skips the normal cleanup code in handleSocket. _outstandingSocketThreads++; - thread t; + pair> threadPair; bool threadStarted = false; while (!threadStarted) { try { - t = thread(&BedrockServer::handleSocket, this, move(socket), port == _controlPort, port == _commandPortPublic, port == _commandPortPrivate); + threadPair = SThreadWithCleanup( + [this]() { + _socketThreadCleanup(); + }, + &BedrockServer::handleSocket, this, move(socket), port == _controlPort, port == _commandPortPublic, port == _commandPortPrivate); threadStarted = true; } catch (const system_error& e) { // We don't care about this lock here from a performance perspective, it only happens when we @@ -2246,7 +2255,11 @@ void BedrockServer::_acceptSockets() } } try { - t.detach(); + // Detach the thread - it will run independently. + // The future (threadPair.second) is discarded. If handleSocket catches the + // SSignalException (it does), the future sees normal completion. If it doesn't + // catch, the exception is stored in the future but never retrieved. + threadPair.first.detach(); } catch (const system_error& e) { SALERT("Caught system_error in thread detach: " << e.code() << ", message: " << e.what()); throw; @@ -2513,14 +2526,26 @@ void BedrockServer::handleSocket(Socket&& socket, bool fromControlPort, bool fro SWARN("Socket in unhandled state: " << socket.state); } } + } catch (const SSignalException& e) { + // Signal-generated exception (SIGSEGV, SIGFPE, etc.) - log with full stack trace. + SALERT("handleSocket caught signal " << e.signalName() << " at " << e.faultAddress()); + e.logStackTrace(); } catch (const exception& e) { SALERT("handleSocket got exception: " << e.what()); } catch (...) { SALERT("handleSocket got unknown exception"); } - // At this point out socket is closed and we can clean up. - // Note that we never return early, we always want to hit this code and decrement our counter and clean up our socket. + // Cleanup is handled by SThreadWithCleanup callback - see _socketThreadCleanup(). + // This ensures cleanup happens even if a signal (SIGSEGV, etc.) causes siglongjmp + // to skip the normal function exit. +} + +void BedrockServer::_socketThreadCleanup() +{ + // Decrement our counter and check if we need to unblock new socket threads. + // This is called from SThreadWithCleanup to ensure it runs even if a signal + // causes siglongjmp to skip the normal handleSocket exit. _outstandingSocketThreads--; SINFO("[performance] Socket thread complete (" << _outstandingSocketThreads << " remaining)."); diff --git a/BedrockServer.h b/BedrockServer.h index 75773bbd7..afb5c4e1c 100644 --- a/BedrockServer.h +++ b/BedrockServer.h @@ -346,6 +346,10 @@ class BedrockServer : public SQLiteServer { // those ports. void _acceptSockets(); + // Cleanup callback for socket threads. Called from SThreadWithCleanup to ensure + // cleanup happens even if a signal causes siglongjmp to skip normal function exit. + void _socketThreadCleanup(); + // This stars the server shutting down. void _beginShutdown(const string& reason, bool detach = false); diff --git a/libstuff/SSignal.cpp b/libstuff/SSignal.cpp index 763ab0f69..621853d07 100644 --- a/libstuff/SSignal.cpp +++ b/libstuff/SSignal.cpp @@ -6,9 +6,12 @@ #include #include #include +#include #include #include +// setjmp.h is included via libstuff.h for signal recovery + thread_local function SSignalHandlerDieFunc; void SSetSignalHandlerDieFunc(function&& func) { @@ -45,6 +48,27 @@ thread _SSignal_signalThread; // `abort()`, this records the original signal number until the signal handler for abort has a chance to log it. thread_local int _SSignal_threadCaughtSignalNumber = 0; +// Thread-local flag indicating this thread should convert signals to exceptions instead of aborting. +// Used by SThread to enable graceful recovery from SIGSEGV/SIGFPE. +thread_local bool _SSignal_threadIsRecoverable = false; + +// Thread-local jump buffer for signal recovery via sigsetjmp/siglongjmp. +thread_local sigjmp_buf _SSignal_recoveryPoint; + +// Thread-local flag indicating whether the recovery point has been set (sigsetjmp called). +thread_local bool _SSignal_recoveryPointSet = false; + +// Thread-local storage for crash info to be passed back after siglongjmp. +struct SSignalCrashInfo +{ + int signum; + void* faultAddress; + void* callstack[32]; + int callstackDepth; + bool hasCrashInfo; +}; +thread_local SSignalCrashInfo _SSignal_crashInfo = {0, nullptr, {}, 0, false}; + // The number of termination signals received so far. atomic _SSignal_terminationCount(0); uint64_t STerminationSignalCount() @@ -89,6 +113,39 @@ void SClearSignals() _SSignal_pendingSignalBitMask.store(0); } +void SSetThreadRecoverable(bool recoverable) +{ + _SSignal_threadIsRecoverable = recoverable; +} + +bool SIsThreadRecoverable() +{ + return _SSignal_threadIsRecoverable; +} + +sigjmp_buf* SGetRecoveryPoint() +{ + return &_SSignal_recoveryPoint; +} + +void SSetRecoveryPointActive(bool active) +{ + _SSignal_recoveryPointSet = active; +} + +SSignalException SBuildSignalException() +{ + SSignalException ex( + _SSignal_crashInfo.signum, + _SSignal_crashInfo.faultAddress, + nullptr, // No instruction pointer available with sigsetjmp approach + _SSignal_crashInfo.callstack, + _SSignal_crashInfo.callstackDepth + ); + _SSignal_crashInfo.hasCrashInfo = false; + return ex; +} + void SInitializeSignals() { // Our default die function does nothing. @@ -118,11 +175,11 @@ void SInitializeSignals() // This is the signal action structure we'll use to specify what to listen for. struct sigaction newAction = {}; - // The old style handler is explicitly null - newAction.sa_handler = nullptr; - newAction.sa_flags = SA_ONSTACK; + // SA_SIGINFO is required to use sa_sigaction (three-argument handler) instead of sa_handler. + // SA_ONSTACK uses the alternate signal stack set up above. + newAction.sa_flags = SA_SIGINFO | SA_ONSTACK; - // The new style handler is _SSignal_StackTrace. + // The three-argument signal handler that receives siginfo_t with fault details. newAction.sa_sigaction = &_SSignal_StackTrace; // While we're inside the signal handler, we want to block any other signals from occurring until we return. @@ -206,6 +263,37 @@ void SStopSignalThread() void _SSignal_StackTrace(int signum, siginfo_t* info, void* ucontext) { + // Check if this is a recoverable signal in a recoverable thread with a recovery point set. + // SIGABRT is not recoverable - it's usually called intentionally or as a result of another crash. + bool isRecoverableSignal = (signum == SIGSEGV || signum == SIGFPE || signum == SIGBUS || signum == SIGILL); + + // Debug: log recovery state + SWARN("Signal handler: tid=" << syscall(SYS_gettid) << " signum=" << signum << " isRecoverable=" << isRecoverableSignal + << " threadRecoverable=" << _SSignal_threadIsRecoverable + << " recoveryPointSet=" << _SSignal_recoveryPointSet); + + if (isRecoverableSignal && _SSignal_threadIsRecoverable && _SSignal_recoveryPointSet) { + // Store crash information in thread-local storage before jumping. + _SSignal_crashInfo.signum = signum; + _SSignal_crashInfo.faultAddress = info ? info->si_addr : nullptr; + + // Capture backtrace while we're in the signal handler context. + // Note: backtrace() is not strictly signal-safe but usually works. + _SSignal_crashInfo.callstackDepth = backtrace( + _SSignal_crashInfo.callstack, + sizeof(_SSignal_crashInfo.callstack) / sizeof(void*) + ); + _SSignal_crashInfo.hasCrashInfo = true; + + SWARN("Signal " << strsignal(signum) << "(" << signum << ") in recoverable thread, " + << "jumping to recovery point. Fault address: " << (info ? info->si_addr : nullptr)); + + // Jump back to the recovery point set in SThread. + // The second argument becomes the return value of sigsetjmp. + siglongjmp(_SSignal_recoveryPoint, signum); + // Never reaches here + } + if (signum == SIGSEGV || signum == SIGABRT || signum == SIGFPE || signum == SIGILL || signum == SIGBUS) { // If we haven't already saved a signal number, we'll do it now. Any signal we catch here will generate a // second ABORT signal, and we don't want that to overwrite this value, so we only set it if unset. @@ -264,7 +352,8 @@ void _SSignal_StackTrace(int signum, siginfo_t* info, void* ucontext) SWARN(fullLogLine); if (fd != -1) { fullLogLine = format("{}{}", fullLogLine, "\n"); - write(fd, fullLogLine.c_str(), strlen(fullLogLine.c_str())); + // Ignore return value - we're in a signal handler and can't do much if write fails. + [[maybe_unused]] auto _ = write(fd, fullLogLine.c_str(), strlen(fullLogLine.c_str())); } free(frame); } @@ -285,7 +374,8 @@ void _SSignal_StackTrace(int signum, siginfo_t* info, void* ucontext) // Finish writing the crash file with the request details if it exists if (fd != -1 && !logMessage.empty()) { logMessage += "\n"; - write(fd, logMessage.c_str(), strlen(logMessage.c_str())); + // Ignore return value - we're in a signal handler and can't do much if write fails. + [[maybe_unused]] auto __ = write(fd, logMessage.c_str(), strlen(logMessage.c_str())); } close(fd); @@ -306,3 +396,64 @@ void _SSignal_StackTrace(int signum, siginfo_t* info, void* ucontext) SALERT("Non-signal thread got signal " << strsignal(signum) << "(" << signum << "), which wasn't expected"); } } + +// SSignalException implementation +SSignalException::SSignalException(int signum, + void* faultAddress, + void* instructionPointer, + void* const* callstack, + int depth) + : _signum(signum), + _faultAddress(faultAddress), + _instructionPointer(instructionPointer), + _depth(min(depth, CALLSTACK_LIMIT)) +{ + if (callstack && _depth > 0) { + memcpy(_callstack, callstack, _depth * sizeof(void*)); + } +} + +const char* SSignalException::what() const noexcept +{ + if (_whatMessage.empty()) { + // Build message lazily to avoid issues in constructor context. + try { + _whatMessage = "Signal " + string(signalName()) + " at address " + SToHex((uint64_t) _faultAddress); + } catch (...) { + _whatMessage = "Signal exception"; + } + } + return _whatMessage.c_str(); +} + +const char* SSignalException::signalName() const noexcept +{ + switch (_signum) { + case SIGSEGV: return "SIGSEGV"; + + case SIGFPE: return "SIGFPE"; + + case SIGBUS: return "SIGBUS"; + + case SIGILL: return "SIGILL"; + + default: return "UNKNOWN"; + } +} + +vector SSignalException::stackTrace() const noexcept +{ + return SGetCallstack(_depth, _callstack); +} + +void SSignalException::logStackTrace() const noexcept +{ + try { + SWARN("Signal " << signalName() << " at fault address " << SToHex((uint64_t) _faultAddress)); + for (const auto& frame : stackTrace()) { + SWARN(" " << frame); + } + } catch (...) { + // Logging failed, but we're noexcept so just ignore. + } +} diff --git a/libstuff/SThread.h b/libstuff/SThread.h index 83fde8b69..6e9376183 100644 --- a/libstuff/SThread.h +++ b/libstuff/SThread.h @@ -5,20 +5,28 @@ #include #include #include +#include +#include #include "libstuff.h" using namespace std; -// SThread is a thread wrapper intended to be used in the same way as thread, -// except that it will trap exceptions and pass them back to the caller as part of a promise. -template -auto SThread(F&& f, Args&&... args) +// Internal implementation of SThread with optional cleanup callback. +// The cleanup callback is called at thread exit regardless of how the thread terminates: +// - Normal completion +// - Exception thrown +// - Signal caught via siglongjmp (SIGSEGV, SIGFPE, etc.) +// This is critical because siglongjmp does not unwind the stack, so cleanup code in the +// user's function will not run in the signal case. +template +auto _SThreadImpl(Cleanup&& cleanup, F&& f, Args&&... args) { // Create type aliases for the function, argument list, and return types. // These are decayed as per decay (https://en.cppreference.com/w/cpp/types/decay.html) // which makes the same sort of type conversions that the compiler makes when passign by value. using Fn = decay_t; + using CleanupFn = decay_t; // We create a tuple from the passed args to allow passing variadic arguments to our lambda below. using DecayedArgsTuple = tuple ...>; @@ -34,11 +42,45 @@ auto SThread(F&& f, Args&&... args) // Now we can create the callable function and it's arguments that we will pass to our lambda. Fn fn(forward(f)); + CleanupFn cleanupFn(forward(cleanup)); DecayedArgsTuple argTuple(forward(args)...); // Finally we can create our new thread and pass it our function and arguments. thread t( - [p = move(prom), fn = move(fn), argTuple = move(argTuple)]() mutable { + [p = move(prom), fn = move(fn), cleanupFn = move(cleanupFn), argTuple = move(argTuple)]() mutable { + // Initialize signal handling for this thread and mark it as recoverable. + // This allows signals like SIGSEGV/SIGFPE to be converted to SSignalException + // instead of aborting the process. + SInitializeSignals(); + SSetThreadRecoverable(true); + SINFO("SThread: tid=" << syscall(SYS_gettid) << " Set threadRecoverable=true, isRecoverable=" << SIsThreadRecoverable()); + + // Set up the recovery point for signal handling using sigsetjmp. + // If a signal occurs, the handler will call siglongjmp and sigsetjmp will + // "return" with the signal number instead of 0. + int signalCaught = sigsetjmp(*SGetRecoveryPoint(), 1); + SSetRecoveryPointActive(true); + SINFO("SThread: tid=" << syscall(SYS_gettid) << " Set recoveryPointActive=true after sigsetjmp"); + + if (signalCaught != 0) { + // We got here via siglongjmp from the signal handler. + // signalCaught contains the signal number. + SSetRecoveryPointActive(false); + SSetThreadRecoverable(false); + + // Build the exception from the crash info stored by the signal handler. + SSignalException ex = SBuildSignalException(); + + SWARN("Signal exception in SThread: " << ex.what()); + ex.logStackTrace(); + p.set_exception(make_exception_ptr(ex)); + + // Call cleanup before returning - this is critical because siglongjmp + // skips any cleanup code in the user's function. + cleanupFn(); + return; + } + try { // We call `apply` to use our argments from a tuple as if they were a list of discrete arguments. // This is effectively like calling `invoke` and passing the arguments separately. @@ -50,6 +92,11 @@ auto SThread(F&& f, Args&&... args) } else { p.set_value(apply(move(fn), move(argTuple))); } + } catch (const SSignalException& e) { + // Signal-generated exception - log stack trace before propagating. + SWARN("Signal exception in SThread: " << e.what()); + e.logStackTrace(); + p.set_exception(current_exception()); } catch (const exception& e) { SWARN("Uncaught exception in SThread: " << e.what()); p.set_exception(current_exception()); @@ -57,6 +104,13 @@ auto SThread(F&& f, Args&&... args) SWARN("Uncaught exception in SThread: unknown type"); p.set_exception(current_exception()); } + + // Restore non-recoverable state (belt-and-suspenders, thread is ending anyway). + SSetRecoveryPointActive(false); + SSetThreadRecoverable(false); + + // Call cleanup at thread exit. + cleanupFn(); } ); @@ -64,3 +118,23 @@ auto SThread(F&& f, Args&&... args) // for it to complete, and also the future, so that the caller can check if there were any exceptions. return make_pair(move(t), move(fut)); } + +// SThread is a thread wrapper intended to be used in the same way as thread, +// except that it will trap exceptions (including signal-generated exceptions like SIGSEGV) +// and pass them back to the caller as part of a promise. +template +auto SThread(F&& f, Args&&... args) +{ + return _SThreadImpl([](){ + }, forward(f), forward(args)...); +} + +// SThreadWithCleanup is like SThread but takes a cleanup callback that is guaranteed +// to be called when the thread exits, even if the thread is terminated by a signal +// (SIGSEGV, SIGFPE, etc.). This is necessary because siglongjmp does not unwind +// the stack, so any cleanup code in the user's function would be skipped. +template +auto SThreadWithCleanup(Cleanup&& cleanup, F&& f, Args&&... args) +{ + return _SThreadImpl(forward(cleanup), forward(f), forward(args)...); +} diff --git a/libstuff/libstuff.h b/libstuff/libstuff.h index dc39b00e1..295d040bc 100644 --- a/libstuff/libstuff.h +++ b/libstuff/libstuff.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -169,6 +170,52 @@ class SException : public exception { // Utility function for generating pretty callstacks. vector SGetCallstack(int depth = 0, void* const* callstack = nullptr) noexcept; +// An SSignalException is thrown when a recoverable thread (started via SThread) receives +// a signal like SIGSEGV or SIGFPE. Instead of crashing the process, the signal is converted +// into this exception which can be caught and handled gracefully. +class SSignalException : public exception { +private: + static const int CALLSTACK_LIMIT = 32; + int _signum; + void* _faultAddress; + void* _instructionPointer; + void* _callstack[CALLSTACK_LIMIT]; + int _depth; + mutable string _whatMessage; + +public: + SSignalException(int signum, + void* faultAddress, + void* instructionPointer, + void* const* callstack, + int depth); + + const char* what() const noexcept override; + + // Accessors + int signal() const noexcept + { + return _signum; + } + + void* faultAddress() const noexcept + { + return _faultAddress; + } + + void* instructionPointer() const noexcept + { + return _instructionPointer; + } + + // Get demangled stack trace + vector stackTrace() const noexcept; + void logStackTrace() const noexcept; + + // Signal name for logging + const char* signalName() const noexcept; +}; + // -------------------------------------------------------------------------- // Time stuff TODO: Replace with chrono // -------------------------------------------------------------------------- @@ -235,6 +282,23 @@ void SClearSignals(); void SStopSignalThread(); +// Mark this thread as recoverable - signals like SIGSEGV/SIGFPE will be converted +// to SSignalException instead of aborting. Used by SThread. +void SSetThreadRecoverable(bool recoverable); + +// Check if this thread is marked as recoverable. +bool SIsThreadRecoverable(); + +// Get the thread-local recovery point for sigsetjmp/siglongjmp signal recovery. +sigjmp_buf* SGetRecoveryPoint(); + +// Set whether the recovery point is active (sigsetjmp has been called). +void SSetRecoveryPointActive(bool active); + +// Build an SSignalException from the current thread-local crash info. +// Called after siglongjmp returns to the recovery point. +SSignalException SBuildSignalException(); + // And also exception stuff. string SGetCurrentExceptionName(); void STerminateHandler(void); diff --git a/test/clustertest/testplugin/TestPlugin.cpp b/test/clustertest/testplugin/TestPlugin.cpp index ccba49ae4..88fbede8b 100644 --- a/test/clustertest/testplugin/TestPlugin.cpp +++ b/test/clustertest/testplugin/TestPlugin.cpp @@ -1,5 +1,6 @@ #include "TestPlugin.h" +#include #include #include #include @@ -111,6 +112,8 @@ unique_ptr BedrockPlugin_TestPlugin::getCommand(SQLiteCommand&& "testPostProcessTimeout", "EscalateSerializedData", "ThreadException", + "ThreadSIGSEGV", + "ThreadSIGFPE", "httpswait" }; for (auto& cmdName : supportedCommands) { @@ -248,6 +251,51 @@ bool TestPluginCommand::peek(SQLite& db) threadpair.second.get(); } + return true; + } else if (SStartsWith(request.methodLine, "ThreadSIGSEGV")) { + // Test SIGSEGV recovery in SThread. + auto threadpair = SThread([](){ + // Deliberately cause SIGSEGV by dereferencing null pointer. + int* ptr = nullptr; + *ptr = 42; + }); + + // Wait for the thread to finish. + threadpair.first.join(); + + // If rethrow is set, try to get the future result which should throw SSignalException. + if (request.isSet("rethrow")) { + try { + threadpair.second.get(); + } catch (const SSignalException& e) { + response.methodLine = "200 Caught SIGSEGV"; + return true; + } + } + + response.methodLine = "200 OK"; + return true; + } else if (SStartsWith(request.methodLine, "ThreadSIGFPE")) { + // Test SIGFPE recovery in SThread. + auto threadpair = SThread([](){ + // Raise SIGFPE explicitly - division by zero doesn't reliably raise SIGFPE on all systems. + raise(SIGFPE); + }); + + // Wait for the thread to finish. + threadpair.first.join(); + + // If rethrow is set, try to get the future result which should throw SSignalException. + if (request.isSet("rethrow")) { + try { + threadpair.second.get(); + } catch (const SSignalException& e) { + response.methodLine = "200 Caught SIGFPE"; + return true; + } + } + + response.methodLine = "200 OK"; return true; } else if (SStartsWith(request.methodLine, "EscalateSerializedData")) { // Only set this if it's blank. The intention is that it will be blank on a follower, but already set by diff --git a/test/clustertest/tests/SignalExceptionTest.cpp b/test/clustertest/tests/SignalExceptionTest.cpp new file mode 100644 index 000000000..d2beab309 --- /dev/null +++ b/test/clustertest/tests/SignalExceptionTest.cpp @@ -0,0 +1,61 @@ +#include +#include + +struct SignalExceptionTest : tpunit::TestFixture +{ + SignalExceptionTest() + : tpunit::TestFixture("SignalException", + TEST(SignalExceptionTest::testThreadSIGSEGV), + TEST(SignalExceptionTest::testThreadSIGFPE), + TEST(SignalExceptionTest::testSocketThreadSIGSEGV)) + { + } + + // Test SIGSEGV in an SThread is caught and propagated + void testThreadSIGSEGV() + { + BedrockClusterTester tester; + + // Without rethrow - thread catches exception internally + SData command("ThreadSIGSEGV"); + SData result = tester.getTester(0).executeWaitMultipleData({command})[0]; + ASSERT_EQUAL(result.methodLine, "200 OK"); + + // With rethrow - verify exception is caught as SSignalException + command["rethrow"] = "true"; + result = tester.getTester(0).executeWaitMultipleData({command})[0]; + ASSERT_EQUAL(result.methodLine, "200 Caught SIGSEGV"); + } + + // Test SIGFPE in an SThread is caught and propagated + void testThreadSIGFPE() + { + BedrockClusterTester tester; + + SData command("ThreadSIGFPE"); + command["rethrow"] = "true"; + SData result = tester.getTester(0).executeWaitMultipleData({command})[0]; + ASSERT_EQUAL(result.methodLine, "200 Caught SIGFPE"); + } + + // Test SIGSEGV in socket thread (via generatesegfaultpeek) now recovers + // instead of crashing. Server should remain running. + void testSocketThreadSIGSEGV() + { + BedrockClusterTester tester; + + // This command triggers SIGSEGV in the socket handler thread. + // With signal recovery, the thread should exit gracefully and + // the server should continue running. + SData command("generatesegfaultpeek"); + SData result = tester.getTester(0).executeWaitMultipleData({command})[0]; + + // The socket will close due to the exception, but server stays up. + // We might get a connection error or empty response. + // The key test is that the server is still alive: + + SData pingCommand("Status"); + SData pingResult = tester.getTester(0).executeWaitMultipleData({pingCommand})[0]; + ASSERT_EQUAL(pingResult.methodLine, "200 OK"); + } +} __SignalExceptionTest;