|
16 | 16 | #endif |
17 | 17 | #include "cudaq/platform.h" |
18 | 18 | #include "distributed/mpi_plugin.h" |
| 19 | +#include <cstring> |
19 | 20 | #include <dlfcn.h> |
| 21 | +#include <execinfo.h> |
| 22 | +#include <fcntl.h> |
20 | 23 | #include <filesystem> |
21 | 24 | #include <map> |
22 | 25 | #include <signal.h> |
23 | 26 | #include <string> |
| 27 | +#include <sys/ucontext.h> |
| 28 | +#include <unistd.h> |
24 | 29 | #include <vector> |
25 | 30 | namespace nvqir { |
26 | 31 | void tearDownBeforeMPIFinalize(); |
@@ -253,12 +258,264 @@ void cudaqCtrlCHandler(int signal) { |
253 | 258 | std::exit(1); |
254 | 259 | } |
255 | 260 |
|
| 261 | +// #region agent log - SIGSEGV handler for debugging |
/// Best-effort write of `size` bytes from `data` to stderr.
///
/// Retries after short writes so long dumps are not silently truncated, and
/// gives up on the first error: there is nothing sensible to do about a
/// failing stderr from inside a crash handler.
static void crashDumpWrite(const char *data, size_t size) {
  while (size > 0) {
    const ssize_t written = write(STDERR_FILENO, data, size);
    if (written <= 0)
      return;
    data += written;
    size -= static_cast<size_t>(written);
  }
}

/// Write the text produced by a preceding `snprintf(buf, capacity, ...)`.
///
/// `len` is the snprintf return value, i.e. the length the output *would*
/// have had. It can be negative (error) or >= capacity (truncation), so it
/// must be clamped before being used as a byte count.
static void crashDumpEmit(const char *buf, int len, size_t capacity) {
  if (len <= 0 || capacity == 0)
    return;
  size_t size = static_cast<size_t>(len);
  if (size >= capacity)
    size = capacity - 1; // snprintf stored capacity-1 characters plus NUL
  crashDumpWrite(buf, size);
}

#if defined(__x86_64__) && defined(__linux__)
/// Read one 64-bit word at `addr` without risking a fault.
///
/// The word is pushed through a pipe: the kernel performs the copy and
/// write() fails instead of raising SIGSEGV when `addr` is not readable.
/// Returns true and stores the word in `*out` on success.
static bool crashDumpProbeWord(const int *pipeFds, const void *addr,
                               unsigned long long *out) {
  const ssize_t wanted = static_cast<ssize_t>(sizeof(*out));
  const ssize_t pushed = write(pipeFds[1], addr, sizeof(*out));
  if (pushed != wanted) {
    // A word straddling a mapping boundary may be copied partially; drain
    // those bytes so the next probe starts with an empty pipe.
    if (pushed > 0) {
      char scratch[sizeof(*out)];
      (void)!read(pipeFds[0], scratch, static_cast<size_t>(pushed));
    }
    return false;
  }
  return read(pipeFds[0], out, sizeof(*out)) == wanted;
}
#endif

/// Fatal-signal handler (SA_SIGINFO style) that prints a crash report to
/// stderr and then re-raises the signal with the default disposition.
///
/// The report contains: the signal name, fault address and si_code, and, on
/// x86_64 Linux, a register dump, whether the handler runs on the alternate
/// signal stack, and a dump of the words around RSP. A backtrace, the
/// contents of /proc/self/maps and a few analysis hints follow on all
/// platforms.
///
/// \param sig     Signal number being handled.
/// \param info    Kernel-provided signal details (si_code, si_addr, ...).
/// \param context ucontext_t* of the interrupted thread; only used on
///                x86_64 Linux.
///
/// NOTE: snprintf and backtrace_symbols_fd are not on the POSIX
/// async-signal-safe list; this is a debugging aid for a process that is
/// about to die anyway, not a recoverable handler.
void cudaqSegfaultHandler(int sig, siginfo_t *info, void *context) {
  char buf[512];
  int len;
  (void)context; // only consumed in the x86_64/Linux section below

  // Header with signal type
  const char *sig_name = "UNKNOWN";
  switch (sig) {
  case SIGSEGV:
    sig_name = "SIGSEGV (Segmentation Fault)";
    break;
  case SIGBUS:
    sig_name = "SIGBUS (Bus Error)";
    break;
  case SIGABRT:
    sig_name = "SIGABRT (Abort)";
    break;
  case SIGFPE:
    sig_name = "SIGFPE (Floating Point Exception)";
    break;
  case SIGILL:
    sig_name = "SIGILL (Illegal Instruction)";
    break;
  default:
    break;
  }

  len = snprintf(
      buf, sizeof(buf),
      "\n╔══════════════════════════════════════════════════════════════╗\n"
      "║ FATAL SIGNAL: %s (%d) \n"
      "╚══════════════════════════════════════════════════════════════╝\n",
      sig_name, sig);
  crashDumpEmit(buf, len, sizeof(buf));

  // Fault address and signal code
  len = snprintf(buf, sizeof(buf), "\n[FAULT INFO]\n");
  crashDumpEmit(buf, len, sizeof(buf));

  // si_code <= 0 means the signal was sent by a process (kill, raise, abort)
  // rather than generated by a fault. In that case si_addr shares storage
  // with si_pid/si_uid and would print as a bogus address.
  const bool sentByProcess = info->si_code <= 0;
  if (sentByProcess)
    len = snprintf(buf, sizeof(buf),
                   " Fault address: n/a (signal sent by pid %d)\n",
                   (int)info->si_pid);
  else
    len = snprintf(buf, sizeof(buf), " Fault address: %p\n", info->si_addr);
  crashDumpEmit(buf, len, sizeof(buf));

  const char *code_str = "unknown";
  if (sentByProcess) {
    code_str = "sent by kill/raise/abort - not a hardware fault";
  } else if (sig == SIGSEGV) {
    switch (info->si_code) {
    case SEGV_MAPERR:
      code_str = "SEGV_MAPERR (address not mapped to object)";
      break;
    case SEGV_ACCERR:
      code_str = "SEGV_ACCERR (invalid permissions for mapped object)";
      break;
    default:
      break;
    }
  } else if (sig == SIGBUS) {
    switch (info->si_code) {
    case BUS_ADRALN:
      code_str = "BUS_ADRALN (invalid address alignment)";
      break;
    case BUS_ADRERR:
      code_str = "BUS_ADRERR (nonexistent physical address)";
      break;
    case BUS_OBJERR:
      code_str = "BUS_OBJERR (object-specific hardware error)";
      break;
    default:
      break;
    }
  }
  len = snprintf(buf, sizeof(buf), " Signal code: %d (%s)\n", info->si_code,
                 code_str);
  crashDumpEmit(buf, len, sizeof(buf));

  // Register dump from ucontext (x86_64 Linux)
#if defined(__x86_64__) && defined(__linux__)
  ucontext_t *uc = (ucontext_t *)context;
  mcontext_t *mc = &uc->uc_mcontext;

  len = snprintf(buf, sizeof(buf), "\n[REGISTERS (x86_64)]\n");
  crashDumpEmit(buf, len, sizeof(buf));

  len = snprintf(buf, sizeof(buf),
                 " RIP: %016llx RSP: %016llx RBP: %016llx\n",
                 (unsigned long long)mc->gregs[REG_RIP],
                 (unsigned long long)mc->gregs[REG_RSP],
                 (unsigned long long)mc->gregs[REG_RBP]);
  crashDumpEmit(buf, len, sizeof(buf));

  len = snprintf(buf, sizeof(buf),
                 " RAX: %016llx RBX: %016llx RCX: %016llx\n",
                 (unsigned long long)mc->gregs[REG_RAX],
                 (unsigned long long)mc->gregs[REG_RBX],
                 (unsigned long long)mc->gregs[REG_RCX]);
  crashDumpEmit(buf, len, sizeof(buf));

  len = snprintf(buf, sizeof(buf),
                 " RDX: %016llx RSI: %016llx RDI: %016llx\n",
                 (unsigned long long)mc->gregs[REG_RDX],
                 (unsigned long long)mc->gregs[REG_RSI],
                 (unsigned long long)mc->gregs[REG_RDI]);
  crashDumpEmit(buf, len, sizeof(buf));

  len = snprintf(buf, sizeof(buf),
                 " R8: %016llx R9: %016llx R10: %016llx\n",
                 (unsigned long long)mc->gregs[REG_R8],
                 (unsigned long long)mc->gregs[REG_R9],
                 (unsigned long long)mc->gregs[REG_R10]);
  crashDumpEmit(buf, len, sizeof(buf));

  len = snprintf(buf, sizeof(buf),
                 " R11: %016llx R12: %016llx R13: %016llx\n",
                 (unsigned long long)mc->gregs[REG_R11],
                 (unsigned long long)mc->gregs[REG_R12],
                 (unsigned long long)mc->gregs[REG_R13]);
  crashDumpEmit(buf, len, sizeof(buf));

  len = snprintf(buf, sizeof(buf),
                 " R14: %016llx R15: %016llx EFLAGS: %016llx\n",
                 (unsigned long long)mc->gregs[REG_R14],
                 (unsigned long long)mc->gregs[REG_R15],
                 (unsigned long long)mc->gregs[REG_EFL]);
  crashDumpEmit(buf, len, sizeof(buf));

  // Check if we're running on alternate stack (important for stack corruption)
  stack_t current_stack;
  if (sigaltstack(NULL, &current_stack) == 0) {
    const char *stack_status =
        (current_stack.ss_flags & SS_ONSTACK)
            ? "YES (good - handler is protected from stack corruption)"
            : "NO (handler using main stack - may be unreliable if stack is "
              "corrupted)";
    len =
        snprintf(buf, sizeof(buf),
                 "\n[SIGNAL HANDLER STACK]\n Running on alternate stack: %s\n",
                 stack_status);
    crashDumpEmit(buf, len, sizeof(buf));
  }

  // Stack dump around RSP - be very careful here since stack may be corrupted
  len =
      snprintf(buf, sizeof(buf),
               "\n[STACK DUMP around RSP (best effort - may be corrupted)]\n");
  crashDumpEmit(buf, len, sizeof(buf));

  unsigned long long *sp = (unsigned long long *)mc->gregs[REG_RSP];

  // Validate RSP looks reasonable before trying to read from it
  // Check if it's in a plausible stack range (not null, not tiny, not huge)
  if ((unsigned long long)sp < 0x10000 ||
      (unsigned long long)sp > 0x7fffffffffff) {
    len = snprintf(buf, sizeof(buf),
                   " RSP (%p) looks invalid - skipping stack dump\n",
                   (void *)sp);
    crashDumpEmit(buf, len, sizeof(buf));
  } else {
    // Words are read through a pipe so that an unreadable address (guard
    // page after a stack overflow, corrupt RSP) is reported instead of
    // killing the handler before the backtrace and memory map are printed.
    int probeFds[2];
    if (pipe(probeFds) != 0) {
      len = snprintf(buf, sizeof(buf),
                     " (could not create probe pipe - skipping stack dump)\n");
      crashDumpEmit(buf, len, sizeof(buf));
    } else {
      // Dump 8 words before and 32 words after RSP
      for (int i = -8; i < 32; i++) {
        unsigned long long *addr = sp + i;

        // Skip obviously bad addresses
        if ((unsigned long long)addr < 0x10000)
          continue;
        if ((unsigned long long)addr > 0x7fffffffffff)
          continue;

        const char *marker = (i == 0) ? " <-- RSP" : "";
        unsigned long long val = 0;
        if (crashDumpProbeWord(probeFds, addr, &val))
          len = snprintf(buf, sizeof(buf), " [RSP%+3d] %p: %016llx%s\n", i * 8,
                         (void *)addr, val, marker);
        else
          len = snprintf(buf, sizeof(buf), " [RSP%+3d] %p: <unreadable>%s\n",
                         i * 8, (void *)addr, marker);
        crashDumpEmit(buf, len, sizeof(buf));
      }
      close(probeFds[0]);
      close(probeFds[1]);
    }
  }
#endif

  // Backtrace with symbols
  len = snprintf(buf, sizeof(buf), "\n[BACKTRACE]\n");
  crashDumpEmit(buf, len, sizeof(buf));

  void *bt[128];
  int bt_size = backtrace(bt, 128);
  backtrace_symbols_fd(bt, bt_size, STDERR_FILENO);

  // Try to read /proc/self/maps for memory layout
  len = snprintf(buf, sizeof(buf), "\n[MEMORY MAP (from /proc/self/maps)]\n");
  crashDumpEmit(buf, len, sizeof(buf));

  int maps_fd = open("/proc/self/maps", O_RDONLY);
  if (maps_fd >= 0) {
    char maps_buf[4096];
    ssize_t bytes_read;
    while ((bytes_read = read(maps_fd, maps_buf, sizeof(maps_buf))) > 0) {
      crashDumpWrite(maps_buf, static_cast<size_t>(bytes_read));
    }
    close(maps_fd);
  } else {
    const char *err = " (could not read /proc/self/maps)\n";
    crashDumpWrite(err, strlen(err));
  }

  // Generic hints for interpreting the report
  len = snprintf(
      buf, sizeof(buf),
      "\n[ANALYSIS HINTS]\n"
      " - If fault addr is 0x0-0xfff: NULL pointer dereference\n"
      " - If fault addr is near RSP: Stack overflow/corruption\n"
      " - If RIP is in unmapped region: Return address corruption\n"
      " - If during exception unwind: Check exception handling tables\n");
  crashDumpEmit(buf, len, sizeof(buf));

  // Footer
  len = snprintf(
      buf, sizeof(buf),
      "\n╔══════════════════════════════════════════════════════════════╗\n"
      "║ END OF CRASH DUMP - Re-raising signal for core dump ║\n"
      "╚══════════════════════════════════════════════════════════════╝\n\n");
  crashDumpEmit(buf, len, sizeof(buf));

  // Restore the default disposition and re-raise so the process terminates
  // with the original signal (and produces a core dump where enabled).
  signal(sig, SIG_DFL);
  raise(sig);
}
| 474 | +// #endregion |
| 475 | + |
// Backing storage for the alternate signal stack used by the crash handler.
// A handler that runs on the interrupted thread's own stack cannot be relied
// on when that stack is corrupted or exhausted, so it gets dedicated memory.
// 64KB is deliberately larger than the typical SIGSTKSZ (8KB-32KB).
static char altStackMem[64 * 1024];
| 482 | + |
256 | 483 | __attribute__((constructor)) void startSigIntHandler() { |
257 | 484 | struct sigaction sigIntHandler; |
258 | 485 | sigIntHandler.sa_handler = cudaqCtrlCHandler; |
259 | 486 | sigemptyset(&sigIntHandler.sa_mask); |
260 | 487 | sigIntHandler.sa_flags = 0; |
261 | 488 | sigaction(SIGINT, &sigIntHandler, NULL); |
| 489 | + |
| 490 | + // #region agent log - Install alternate signal stack and SIGSEGV handler |
| 491 | + // Set up alternate stack FIRST - this is crucial for handling stack |
| 492 | + // corruption |
| 493 | + stack_t altStack; |
| 494 | + altStack.ss_sp = altStackMem; |
| 495 | + altStack.ss_size = sizeof(altStackMem); |
| 496 | + altStack.ss_flags = 0; |
| 497 | + if (sigaltstack(&altStack, NULL) == 0) { |
| 498 | + // Alternate stack set up successfully |
| 499 | + } else { |
| 500 | + // Failed to set up alt stack - handler will use main stack (risky if |
| 501 | + // corrupted) |
| 502 | + perror("Warning: sigaltstack failed, crash handler may not work with stack " |
| 503 | + "corruption"); |
| 504 | + } |
| 505 | + |
| 506 | + struct sigaction segvHandler; |
| 507 | + segvHandler.sa_sigaction = cudaqSegfaultHandler; |
| 508 | + sigemptyset(&segvHandler.sa_mask); |
| 509 | + // SA_ONSTACK: Use alternate stack (critical for stack corruption) |
| 510 | + // SA_SIGINFO: Get detailed info via siginfo_t |
| 511 | + // SA_RESETHAND: Reset to default after first signal (prevent infinite loops) |
| 512 | + segvHandler.sa_flags = SA_SIGINFO | SA_ONSTACK | SA_RESETHAND; |
| 513 | + sigaction(SIGSEGV, &segvHandler, NULL); |
| 514 | + sigaction(SIGBUS, &segvHandler, NULL); // Also catch bus errors |
| 515 | + sigaction(SIGABRT, &segvHandler, NULL); // And aborts |
| 516 | + sigaction(SIGFPE, &segvHandler, NULL); // Floating point exceptions |
| 517 | + sigaction(SIGILL, &segvHandler, NULL); // Illegal instruction |
| 518 | + // #endregion |
262 | 519 | } |
263 | 520 | } // namespace __internal__ |
264 | 521 |
|
|
0 commit comments