Skip to content

Commit c24904c

Browse files
committed
register exception handlers
Signed-off-by: Luca Mondada <luca@mondada.net>
1 parent 11e9651 commit c24904c

File tree

1 file changed

+257
-0
lines changed

1 file changed

+257
-0
lines changed

runtime/cudaq/cudaq.cpp

Lines changed: 257 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,16 @@
1616
#endif
1717
#include "cudaq/platform.h"
1818
#include "distributed/mpi_plugin.h"
19+
#include <cstring>
1920
#include <dlfcn.h>
21+
#include <execinfo.h>
22+
#include <fcntl.h>
2023
#include <filesystem>
2124
#include <map>
2225
#include <signal.h>
2326
#include <string>
27+
#include <sys/ucontext.h>
28+
#include <unistd.h>
2429
#include <vector>
2530
namespace nvqir {
2631
void tearDownBeforeMPIFinalize();
@@ -253,12 +258,264 @@ void cudaqCtrlCHandler(int signal) {
253258
std::exit(1);
254259
}
255260

261+
// #region agent log - SIGSEGV handler for debugging
262+
/// Best-effort write of `len` bytes to stderr, continuing after short writes.
/// Gives up silently on error: this runs inside a fatal-signal handler, where
/// nothing useful can be done about a failing stderr.
static void crashDumpWrite(const char *data, size_t len) {
  while (len > 0) {
    ssize_t written = write(STDERR_FILENO, data, len);
    if (written <= 0)
      return;
    data += written;
    len -= (size_t)written;
  }
}

/// Emit a buffer that was just filled by snprintf(buf, cap, ...).
/// snprintf returns the length the output *would* have had, which can exceed
/// `cap` on truncation (or be negative on error), so the count is clamped to
/// what is actually stored in `buf` before writing.
static void crashDumpEmit(const char *buf, size_t cap, int len) {
  if (len <= 0 || cap == 0)
    return;
  size_t count = (size_t)len;
  if (count >= cap)
    count = cap - 1;
  crashDumpWrite(buf, count);
}

/// Read one 8-byte word from `addr` without risking a nested fault.
/// The word is pushed through a pipe: write() copies from `addr` inside the
/// kernel and fails (EFAULT / short count) instead of raising a signal when
/// the memory is not readable. Returns true and stores the word in `*out` only
/// when all 8 bytes were readable.
static bool crashDumpPeekWord(int readFd, int writeFd, const void *addr,
                              unsigned long long *out) {
  ssize_t queued = write(writeFd, addr, sizeof(*out));
  if (queued <= 0)
    return false;
  // Always drain whatever was queued so a partial copy cannot leak into the
  // next probe.
  unsigned long long word = 0;
  ssize_t drained = read(readFd, &word, (size_t)queued);
  if (queued != (ssize_t)sizeof(*out) || drained != queued)
    return false;
  *out = word;
  return true;
}

/// Fatal-signal handler (installed with SA_SIGINFO) that prints a crash report
/// to stderr and then re-raises the signal with the default disposition so the
/// process still terminates (and can produce a core dump).
///
/// The report contains: the signal name, fault address and si_code, and - on
/// x86_64 Linux - the general purpose registers, whether the handler runs on
/// the alternate stack, and a guarded dump of the words around RSP; followed by
/// a backtrace, the contents of /proc/self/maps and a few analysis hints.
///
/// \param sig     Signal number being handled.
/// \param info    Signal details supplied by the kernel (may be inspected for
///                si_code / si_addr).
/// \param context ucontext_t* describing the interrupted thread's registers.
///
/// NOTE: snprintf and backtrace are not formally async-signal-safe; they are
/// used here on a best-effort basis because the process is about to die.
void cudaqSegfaultHandler(int sig, siginfo_t *info, void *context) {
  char buf[512];
  int len;
  (void)context; // only consumed by the x86_64 Linux register dump below

  // Header with signal type
  const char *sig_name = "UNKNOWN";
  switch (sig) {
  case SIGSEGV:
    sig_name = "SIGSEGV (Segmentation Fault)";
    break;
  case SIGBUS:
    sig_name = "SIGBUS (Bus Error)";
    break;
  case SIGABRT:
    sig_name = "SIGABRT (Abort)";
    break;
  case SIGFPE:
    sig_name = "SIGFPE (Floating Point Exception)";
    break;
  case SIGILL:
    sig_name = "SIGILL (Illegal Instruction)";
    break;
  default:
    break;
  }

  len = snprintf(
      buf, sizeof(buf),
      "\n╔══════════════════════════════════════════════════════════════╗\n"
      "║ FATAL SIGNAL: %s (%d) \n"
      "╚══════════════════════════════════════════════════════════════╝\n",
      sig_name, sig);
  crashDumpEmit(buf, sizeof(buf), len);

  // Fault address and signal code
  static const char faultHeader[] = "\n[FAULT INFO]\n";
  crashDumpWrite(faultHeader, sizeof(faultHeader) - 1);

  // si_code <= 0 means the signal was sent by software (kill/raise/abort). In
  // that case si_addr shares storage with the sender's pid/uid and is not an
  // address, so it must not be reported as one.
  const int sigCode = info ? info->si_code : 0;
  const bool hasFaultAddr = (info != NULL) && (sigCode > 0);
  if (hasFaultAddr) {
    len = snprintf(buf, sizeof(buf), " Fault address: %p\n", info->si_addr);
    crashDumpEmit(buf, sizeof(buf), len);
  } else {
    static const char noAddr[] =
        " Fault address: n/a (signal was sent by software)\n";
    crashDumpWrite(noAddr, sizeof(noAddr) - 1);
  }

  const char *code_str = "unknown";
  if (!hasFaultAddr) {
    code_str = "sent by kill/raise/abort (not a hardware fault)";
  } else if (sig == SIGSEGV) {
    switch (sigCode) {
    case SEGV_MAPERR:
      code_str = "SEGV_MAPERR (address not mapped to object)";
      break;
    case SEGV_ACCERR:
      code_str = "SEGV_ACCERR (invalid permissions for mapped object)";
      break;
    }
  } else if (sig == SIGBUS) {
    switch (sigCode) {
    case BUS_ADRALN:
      code_str = "BUS_ADRALN (invalid address alignment)";
      break;
    case BUS_ADRERR:
      code_str = "BUS_ADRERR (nonexistent physical address)";
      break;
    case BUS_OBJERR:
      code_str = "BUS_OBJERR (object-specific hardware error)";
      break;
    }
  }
  len = snprintf(buf, sizeof(buf), " Signal code: %d (%s)\n", sigCode,
                 code_str);
  crashDumpEmit(buf, sizeof(buf), len);

  // Register dump from ucontext (x86_64 Linux)
#if defined(__x86_64__) && defined(__linux__)
  ucontext_t *uc = (ucontext_t *)context;
  mcontext_t *mc = &uc->uc_mcontext;

  static const char regHeader[] = "\n[REGISTERS (x86_64)]\n";
  crashDumpWrite(regHeader, sizeof(regHeader) - 1);

  len = snprintf(buf, sizeof(buf),
                 " RIP: %016llx RSP: %016llx RBP: %016llx\n",
                 (unsigned long long)mc->gregs[REG_RIP],
                 (unsigned long long)mc->gregs[REG_RSP],
                 (unsigned long long)mc->gregs[REG_RBP]);
  crashDumpEmit(buf, sizeof(buf), len);

  len = snprintf(buf, sizeof(buf),
                 " RAX: %016llx RBX: %016llx RCX: %016llx\n",
                 (unsigned long long)mc->gregs[REG_RAX],
                 (unsigned long long)mc->gregs[REG_RBX],
                 (unsigned long long)mc->gregs[REG_RCX]);
  crashDumpEmit(buf, sizeof(buf), len);

  len = snprintf(buf, sizeof(buf),
                 " RDX: %016llx RSI: %016llx RDI: %016llx\n",
                 (unsigned long long)mc->gregs[REG_RDX],
                 (unsigned long long)mc->gregs[REG_RSI],
                 (unsigned long long)mc->gregs[REG_RDI]);
  crashDumpEmit(buf, sizeof(buf), len);

  len = snprintf(buf, sizeof(buf),
                 " R8: %016llx R9: %016llx R10: %016llx\n",
                 (unsigned long long)mc->gregs[REG_R8],
                 (unsigned long long)mc->gregs[REG_R9],
                 (unsigned long long)mc->gregs[REG_R10]);
  crashDumpEmit(buf, sizeof(buf), len);

  len = snprintf(buf, sizeof(buf),
                 " R11: %016llx R12: %016llx R13: %016llx\n",
                 (unsigned long long)mc->gregs[REG_R11],
                 (unsigned long long)mc->gregs[REG_R12],
                 (unsigned long long)mc->gregs[REG_R13]);
  crashDumpEmit(buf, sizeof(buf), len);

  len = snprintf(buf, sizeof(buf),
                 " R14: %016llx R15: %016llx EFLAGS: %016llx\n",
                 (unsigned long long)mc->gregs[REG_R14],
                 (unsigned long long)mc->gregs[REG_R15],
                 (unsigned long long)mc->gregs[REG_EFL]);
  crashDumpEmit(buf, sizeof(buf), len);

  // Check if we're running on alternate stack (important for stack corruption)
  stack_t current_stack;
  if (sigaltstack(NULL, &current_stack) == 0) {
    const char *stack_status =
        (current_stack.ss_flags & SS_ONSTACK)
            ? "YES (good - handler is protected from stack corruption)"
            : "NO (handler using main stack - may be unreliable if stack is "
              "corrupted)";
    len =
        snprintf(buf, sizeof(buf),
                 "\n[SIGNAL HANDLER STACK]\n Running on alternate stack: %s\n",
                 stack_status);
    crashDumpEmit(buf, sizeof(buf), len);
  }

  // Stack dump around RSP - be very careful here since stack may be corrupted
  static const char stackHeader[] =
      "\n[STACK DUMP around RSP (best effort - may be corrupted)]\n";
  crashDumpWrite(stackHeader, sizeof(stackHeader) - 1);

  unsigned long long *sp = (unsigned long long *)mc->gregs[REG_RSP];

  // Validate RSP looks reasonable before trying to read from it
  // Check if it's in a plausible stack range (not null, not tiny, not huge)
  if ((unsigned long long)sp < 0x10000 ||
      (unsigned long long)sp > 0x7fffffffffff) {
    len = snprintf(buf, sizeof(buf),
                   " RSP (%p) looks invalid - skipping stack dump\n",
                   (void *)sp);
    crashDumpEmit(buf, sizeof(buf), len);
  } else {
    // Dump 8 words before and 32 words after RSP. Each word is fetched through
    // a pipe so that an unreadable address is reported instead of faulting
    // again (which would kill the process before the backtrace and memory map
    // are printed).
    int probe[2];
    if (pipe(probe) != 0) {
      static const char noProbe[] =
          " (could not create probe pipe - skipping stack dump)\n";
      crashDumpWrite(noProbe, sizeof(noProbe) - 1);
    } else {
      for (int i = -8; i < 32; i++) {
        unsigned long long *addr = sp + i;

        // Skip obviously bad addresses
        if ((unsigned long long)addr < 0x10000)
          continue;
        if ((unsigned long long)addr > 0x7fffffffffff)
          continue;

        const char *marker = (i == 0) ? " <-- RSP" : "";
        unsigned long long val = 0;
        if (crashDumpPeekWord(probe[0], probe[1], addr, &val))
          len = snprintf(buf, sizeof(buf), " [RSP%+3d] %p: %016llx%s\n", i * 8,
                         (void *)addr, val, marker);
        else
          len = snprintf(buf, sizeof(buf), " [RSP%+3d] %p: <unreadable>%s\n",
                         i * 8, (void *)addr, marker);
        crashDumpEmit(buf, sizeof(buf), len);
      }
      close(probe[0]);
      close(probe[1]);
    }
  }
#endif

  // Backtrace with symbols
  static const char btHeader[] = "\n[BACKTRACE]\n";
  crashDumpWrite(btHeader, sizeof(btHeader) - 1);

  void *bt[128];
  int bt_size = backtrace(bt, 128);
  backtrace_symbols_fd(bt, bt_size, STDERR_FILENO);

  // Try to read /proc/self/maps for memory layout
  static const char mapsHeader[] = "\n[MEMORY MAP (from /proc/self/maps)]\n";
  crashDumpWrite(mapsHeader, sizeof(mapsHeader) - 1);

  int maps_fd = open("/proc/self/maps", O_RDONLY);
  if (maps_fd >= 0) {
    char maps_buf[4096];
    ssize_t bytes_read;
    while ((bytes_read = read(maps_fd, maps_buf, sizeof(maps_buf))) > 0) {
      crashDumpWrite(maps_buf, (size_t)bytes_read);
    }
    close(maps_fd);
  } else {
    static const char err[] = " (could not read /proc/self/maps)\n";
    crashDumpWrite(err, sizeof(err) - 1);
  }

  // Generic hints to help interpret the values printed above
  static const char hints[] =
      "\n[ANALYSIS HINTS]\n"
      " - If fault addr is 0x0-0xfff: NULL pointer dereference\n"
      " - If fault addr is near RSP: Stack overflow/corruption\n"
      " - If RIP is in unmapped region: Return address corruption\n"
      " - If during exception unwind: Check exception handling tables\n";
  crashDumpWrite(hints, sizeof(hints) - 1);

  static const char footer[] =
      "\n╔══════════════════════════════════════════════════════════════╗\n"
      "║ END OF CRASH DUMP - Re-raising signal for core dump ║\n"
      "╚══════════════════════════════════════════════════════════════╝\n\n";
  crashDumpWrite(footer, sizeof(footer) - 1);

  // Re-raise to get core dump. The signal is blocked while this handler runs,
  // so raise() only marks it pending; the default action takes effect as soon
  // as the handler returns.
  signal(sig, SIG_DFL);
  raise(sig);
}
474+
// #endregion
475+
476+
// #region agent log - Alternate signal stack for crash handler
477+
// This is critical: if the main stack is corrupted, the signal handler
478+
// needs its own stack to run on, otherwise it will crash too.
479+
// Use 64KB which is larger than typical SIGSTKSZ (8KB-32KB)
480+
// Backing storage (65536 bytes) handed to sigaltstack() by
// startSigIntHandler() so that cudaqSegfaultHandler can run with SA_ONSTACK.
// NOTE(review): sigaltstack() registers the stack for the calling thread only,
// so threads other than the one running the constructor presumably do not get
// this alternate stack - confirm whether per-thread registration is needed.
static char altStackMem[65536];
481+
// #endregion
482+
256483
__attribute__((constructor)) void startSigIntHandler() {
257484
struct sigaction sigIntHandler;
258485
sigIntHandler.sa_handler = cudaqCtrlCHandler;
259486
sigemptyset(&sigIntHandler.sa_mask);
260487
sigIntHandler.sa_flags = 0;
261488
sigaction(SIGINT, &sigIntHandler, NULL);
489+
490+
// #region agent log - Install alternate signal stack and SIGSEGV handler
491+
// Set up alternate stack FIRST - this is crucial for handling stack
492+
// corruption
493+
stack_t altStack;
494+
altStack.ss_sp = altStackMem;
495+
altStack.ss_size = sizeof(altStackMem);
496+
altStack.ss_flags = 0;
497+
if (sigaltstack(&altStack, NULL) == 0) {
498+
// Alternate stack set up successfully
499+
} else {
500+
// Failed to set up alt stack - handler will use main stack (risky if
501+
// corrupted)
502+
perror("Warning: sigaltstack failed, crash handler may not work with stack "
503+
"corruption");
504+
}
505+
506+
struct sigaction segvHandler;
507+
segvHandler.sa_sigaction = cudaqSegfaultHandler;
508+
sigemptyset(&segvHandler.sa_mask);
509+
// SA_ONSTACK: Use alternate stack (critical for stack corruption)
510+
// SA_SIGINFO: Get detailed info via siginfo_t
511+
// SA_RESETHAND: Reset to default after first signal (prevent infinite loops)
512+
segvHandler.sa_flags = SA_SIGINFO | SA_ONSTACK | SA_RESETHAND;
513+
sigaction(SIGSEGV, &segvHandler, NULL);
514+
sigaction(SIGBUS, &segvHandler, NULL); // Also catch bus errors
515+
sigaction(SIGABRT, &segvHandler, NULL); // And aborts
516+
sigaction(SIGFPE, &segvHandler, NULL); // Floating point exceptions
517+
sigaction(SIGILL, &segvHandler, NULL); // Illegal instruction
518+
// #endregion
262519
}
263520
} // namespace __internal__
264521

0 commit comments

Comments
 (0)