@@ -471,7 +471,6 @@ void RecordSession::handle_seccomp_traced_syscall(RecordTask* t,
   SupportedArch syscall_arch = t->detect_syscall_arch();
   t->canonicalize_regs(syscall_arch);
   if (!process_syscall_entry(t, step_state, result, syscall_arch)) {
-    step_state->continue_type = RecordSession::DONT_CONTINUE;
     return;
   }
   *did_enter_syscall = true;
@@ -508,6 +507,8 @@ static void seccomp_trap_done(RecordTask* t) {
                (uint8_t)1);
 }
 
+extern void disarm_desched_event(RecordTask* t);
+extern void leave_syscallbuf(RecordTask* t);
 static void handle_seccomp_trap(RecordTask* t,
                                 RecordSession::StepState* step_state,
                                 uint16_t seccomp_data) {
@@ -542,27 +543,21 @@ static void handle_seccomp_trap(RecordTask* t,
     }
   }
 
-  if (t->is_in_untraced_syscall()) {
-    ASSERT(t, !t->delay_syscallbuf_reset_for_seccomp_trap);
-    // Don't reset the syscallbuf immediately after delivering the trap. We have
-    // to wait until this buffered syscall aborts completely before resetting
-    // the buffer.
-    t->delay_syscallbuf_reset_for_seccomp_trap = true;
-
-    t->push_event(Event::seccomp_trap());
-
+  bool is_untraced_syscall = t->is_in_untraced_syscall();
+  if (is_untraced_syscall) {
     // desched may be armed but we're not going to execute the syscall, let
-    // alone block. If it fires, ignore it.
-    t->write_mem(
-        REMOTE_PTR_FIELD(t->syscallbuf_child, desched_signal_may_be_relevant),
-        (uint8_t)0);
+    // alone block. Disarm the event and if it fires, ignore it.
+    disarm_desched_event(t);
+    leave_syscallbuf(t);
+    r = t->regs();
   }
 
+  t->canonicalize_regs(t->detect_syscall_arch());
   t->push_syscall_event(syscallno);
   t->ev().Syscall().failed_during_preparation = true;
   note_entering_syscall(t);
 
-  if (t->is_in_untraced_syscall() && !syscall_entry_already_recorded) {
+  if (is_untraced_syscall && !syscall_entry_already_recorded) {
     t->record_current_event();
   }
 
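
Note: for readers unfamiliar with rr's desched machinery, the helpers declared
extern above manipulate a per-task perf event that fires a signal if the tracee
is descheduled (i.e. blocks) inside an untraced buffered syscall. The sketch
below illustrates only the underlying kernel mechanism; the names and details
are illustrative, not rr's actual implementation.

    // Illustrative sketch: a counter of context switches that delivers a
    // signal on its first overflow, i.e. when the task is descheduled.
    // (Compile with _GNU_SOURCE for F_SETSIG/F_SETOWN_EX.)
    #include <fcntl.h>
    #include <linux/perf_event.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int arm_desched_counter(pid_t tid, int sig) {
      struct perf_event_attr attr;
      memset(&attr, 0, sizeof(attr));
      attr.size = sizeof(attr);
      attr.type = PERF_TYPE_SOFTWARE;
      attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES;
      attr.sample_period = 1; // overflow on the first deschedule
      int fd = syscall(SYS_perf_event_open, &attr, tid, -1, -1, 0);
      if (fd < 0) return -1;
      struct f_owner_ex own = { F_OWNER_TID, tid };
      fcntl(fd, F_SETOWN_EX, &own); // target the signal at this thread
      fcntl(fd, F_SETFL, O_ASYNC);  // deliver a signal on counter overflow
      fcntl(fd, F_SETSIG, sig);
      ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
      return fd;
    }

    static void disarm_desched_counter(int fd) {
      // "Disarming" amounts to stopping the counter so it can no longer fire.
      ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
    }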
@@ -578,10 +573,21 @@ static void handle_seccomp_trap(RecordTask* t,
   si.native_api.si_code = SYS_SECCOMP;
   si.native_api._sifields._sigsys._arch = to_audit_arch(r.arch());
   si.native_api._sifields._sigsys._syscall = syscallno;
+
   // Documentation says that si_call_addr is the address of the syscall
   // instruction, but in tests it's immediately after the syscall
   // instruction.
-  si.native_api._sifields._sigsys._call_addr = t->ip().to_data_ptr<void>();
+  remote_code_ptr seccomp_ip = t->ip();
+
+  /* If we actually deliver this signal, we will fudge the ip value to instead
+     point into the patched-out syscall. The callee may rely on these values
+     matching, so do the same adjustment here. */
+  Monkeypatcher::patched_syscall* ps = t->vm()->monkeypatcher().find_jump_stub(seccomp_ip, true);
+  if (ps) {
+    seccomp_ip = (ps->patch_addr + (seccomp_ip - ps->stub_addr.as_int()).register_value() - (ps->size - ps->safe_suffix)).as_int();
+  }
+
+  si.native_api._sifields._sigsys._call_addr = seccomp_ip.to_data_ptr<void>();
 LOG(debug) << "Synthesizing " << si.linux_api;
 t->stash_synthetic_sig(si.linux_api, DETERMINISTIC_SIG);
@@ -591,16 +597,31 @@ static void handle_seccomp_trap(RecordTask* t,
   t->set_regs(r);
   t->maybe_restore_original_syscall_registers();
 
-  if (t->is_in_untraced_syscall()) {
+  if (is_untraced_syscall) {
+    Registers r = t->regs();
+    // Cause kernel processing to skip the syscall
+    r.set_original_syscallno(SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO);
+    t->set_regs(r);
+    uintptr_t orig_arg1 = r.arg1();
+
+    // The tracee is currently in the seccomp ptrace-stop or syscall-entry stop.
+    // Advance it to the syscall-exit stop so that when we try to deliver the SIGSYS via
+    // PTRACE_SINGLESTEP, that doesn't trigger a SIGTRAP stop.
+    t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS);
+    if (t->status().ptrace_event() == PTRACE_EVENT_SECCOMP) {
+      t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS);
+    }
+
+    if (t->arch() == aarch64) {
+      r = t->regs();
+      r.set_arg1(orig_arg1);
+      t->set_regs(r);
+    }
+
     // For buffered syscalls, go ahead and record the exit state immediately.
     t->ev().Syscall().state = EXITING_SYSCALL;
     t->record_current_event();
     t->pop_syscall();
-
-    // The tracee is currently in the seccomp ptrace-stop. Advance it to the
-    // syscall-exit stop so that when we try to deliver the SIGSYS via
-    // PTRACE_SINGLESTEP, that doesn't trigger a SIGTRAP stop.
-    t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS);
   }
 
   // Don't continue yet. At the next iteration of record_step, if we
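
Note: two things in this hunk deserve spelling out. First, the aarch64 fix-up
exists because x0 is both the first-argument and the return-value register, so
the (skipped) syscall's return value clobbers arg1, which must be restored from
the value saved before the resume. Second, SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO
relies on a standard ptrace trick: rewriting the syscall number to an invalid
value (-1) at syscall entry makes the kernel execute no syscall at all, and the
tracee arrives at the syscall-exit stop with -ENOSYS. A standalone sketch of
that trick on x86-64 (illustrative, not rr code):

    #include <sys/ptrace.h>
    #include <sys/user.h>
    #include <sys/wait.h>

    // Assumes `tid` is stopped at a syscall-entry stop under ptrace.
    static void skip_current_syscall(pid_t tid) {
      struct user_regs_struct regs;
      ptrace(PTRACE_GETREGS, tid, 0, &regs);
      regs.orig_rax = -1; // no such syscall: the kernel skips it
      ptrace(PTRACE_SETREGS, tid, 0, &regs);
      ptrace(PTRACE_SYSCALL, tid, 0, 0); // run to the syscall-exit stop
      waitpid(tid, nullptr, 0);          // tracee stops with rax == -ENOSYS
    }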
@@ -815,12 +836,6 @@ void RecordSession::task_continue(const StepState& step_state) {
   // A task in an emulated ptrace-stop must really stay stopped
   ASSERT(t, !t->emulated_stop_pending);
 
-  bool may_restart = t->at_may_restart_syscall();
-
-  if (may_restart && t->seccomp_bpf_enabled) {
-    LOG(debug) << "PTRACE_SYSCALL to possibly-restarted " << t->ev();
-  }
-
   if (!t->vm()->first_run_event()) {
     t->vm()->set_first_run_event(trace_writer().time());
   }
@@ -892,7 +907,7 @@ void RecordSession::task_continue(const StepState& step_state) {
        makes PTRACE_SYSCALL traps be delivered *before* seccomp RET_TRACE
        traps.
        Detect and handle this. */
-    if (!t->seccomp_bpf_enabled || may_restart ||
+    if (!t->seccomp_bpf_enabled ||
         syscall_seccomp_ordering_ == PTRACE_SYSCALL_BEFORE_SECCOMP_UNKNOWN) {
       resume = RESUME_SYSCALL;
     } else {
@@ -1232,6 +1247,17 @@ void RecordSession::syscall_state_changed(RecordTask* t,
         ASSERT(t, t->regs().original_syscallno() == -1);
       }
       rec_did_sigreturn(t);
+
+      /* The inverse of the processing we do during signal delivery - if the IP
+         points into a region that we patched out, move us to the extended jump
+         patch instead. */
+      Monkeypatcher::patched_syscall* ps = t->vm()->monkeypatcher().find_syscall_patch(t->ip());
+      if (ps) {
+        Registers r = t->regs();
+        r.set_ip((ps->stub_addr + (r.ip() - ps->patch_addr.as_int()).register_value() + (ps->size - ps->safe_suffix)).as_int());
+        t->set_regs(r);
+      }
+
       t->record_current_event();
       t->pop_syscall();
@@ -1500,6 +1526,7 @@ static bool inject_handled_signal(RecordTask* t) {
   t->stashed_signal_processed();
 
   int sig = t->ev().Signal().siginfo.si_signo;
+
   do {
     // We are ready to inject our signal.
     // XXX we assume the kernel won't respond by notifying us of a different
@@ -1557,6 +1584,69 @@ static bool inject_handled_signal(RecordTask* t) {
   return true;
 }
 
+static ssize_t get_sigframe_size(SupportedArch arch) {
+  if (is_x86ish(arch)) {
+    // It's somewhat difficult engineering-wise to
+    // compute the sigframe size at compile time,
+    // and it can vary across kernel versions and CPU
+    // microarchitectures. So this size is an overestimate
+    // of the real size(s).
+    //
+    // If this size becomes too small in the
+    // future, and unit tests that use sighandlers
+    // are run with checksumming enabled, then
+    // they can catch errors here.
+    return 1152 /* Overestimate of kernel sigframe */ +
+           128 /* Redzone */ +
+           /* this returns 512 when XSAVE unsupported */
+           xsave_area_size();
+  } else if (arch == aarch64) {
+    return sizeof(ARM64Arch::rt_sigframe) +
+           sizeof(ARM64Arch::user_fpsimd_state);
+  } else {
+    DEBUG_ASSERT(0 && "Add sigframe size for your architecture here");
+    return 0;
+  }
+}
+
+template <typename Arch>
+static remote_ptr<typename Arch::unsigned_long> get_sigframe_ip_ptr(remote_ptr<typename Arch::rt_sigframe> frame_ptr);
+
+template <>
+remote_ptr<ARM64Arch::unsigned_long> get_sigframe_ip_ptr<ARM64Arch>(remote_ptr<ARM64Arch::rt_sigframe> frame_ptr) {
+  return REMOTE_PTR_FIELD(REMOTE_PTR_FIELD(REMOTE_PTR_FIELD(REMOTE_PTR_FIELD(frame_ptr, uc), uc_mcontext), regs), pc);
+}
+
+template <>
+remote_ptr<X86Arch::unsigned_long> get_sigframe_ip_ptr<X86Arch>(remote_ptr<X86Arch::rt_sigframe> frame_ptr) {
+  return REMOTE_PTR_FIELD(REMOTE_PTR_FIELD(REMOTE_PTR_FIELD(frame_ptr, uc), uc_mcontext), ip);
+}
+
+template <>
+remote_ptr<X64Arch::unsigned_long> get_sigframe_ip_ptr<X64Arch>(remote_ptr<X64Arch::rt_sigframe> frame_ptr) {
+  return REMOTE_PTR_FIELD(REMOTE_PTR_FIELD(REMOTE_PTR_FIELD(frame_ptr, uc), uc_mcontext), ip);
+}
+
+template <typename Arch>
+static remote_code_ptr get_sigframe_ip_arch(RecordTask* t, remote_ptr<typename Arch::rt_sigframe> frame_ptr) {
+  return t->read_mem(get_sigframe_ip_ptr<Arch>(frame_ptr));
+}
+
+static remote_code_ptr get_sigframe_ip(RecordTask* t, remote_ptr<void> frame_ptr) {
+  RR_ARCH_FUNCTION(get_sigframe_ip_arch, t->arch(), t, frame_ptr.as_int());
+}
+
+template <typename Arch>
+static void set_sigframe_ip_arch(RecordTask* t, remote_ptr<typename Arch::rt_sigframe> frame_ptr, remote_code_ptr ip) {
+  t->write_mem(get_sigframe_ip_ptr<Arch>(frame_ptr), (typename Arch::unsigned_long)ip.register_value());
+}
+
+static void set_sigframe_ip(RecordTask* t, remote_ptr<void> frame_ptr, remote_code_ptr ip) {
+  RR_ARCH_FUNCTION(set_sigframe_ip_arch, t->arch(), t, frame_ptr.as_int(), ip);
+}
+
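
Note: together these helpers read and rewrite the interrupted pc saved in the
signal frame, with RR_ARCH_FUNCTION dispatching to the right rt_sigframe layout
for the tracee's architecture. A hypothetical usage, assuming the tracee has
just entered a signal handler so that sp points at the kernel-pushed frame:

    remote_ptr<void> frame = t->regs().sp(); // rt_sigframe sits at sp
    remote_code_ptr saved_ip = get_sigframe_ip(t, frame);
    // ... translate saved_ip as needed ...
    set_sigframe_ip(t, frame, saved_ip);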
 /**
  * |t| is being delivered a signal, and its state changed.
  * Must call t->stashed_signal_processed() once we're ready to unmask signals.
@@ -1601,26 +1691,37 @@ bool RecordSession::signal_state_changed(RecordTask* t, StepState* step_state) {
         break;
       }
 
-      if (is_x86ish(t->arch())) {
-        // It's somewhat difficult engineering-wise to
-        // compute the sigframe size at compile time,
-        // and it can vary across kernel versions and CPU
-        // microarchitectures. So this size is an overestimate
-        // of the real size(s).
-        //
-        // If this size becomes too small in the
-        // future, and unit tests that use sighandlers
-        // are run with checksumming enabled, then
-        // they can catch errors here.
-        sigframe_size = 1152 /* Overestimate of kernel sigframe */ +
-                        128 /* Redzone */ +
-                        /* this returns 512 when XSAVE unsupported */
-                        xsave_area_size();
-      } else if (t->arch() == aarch64) {
-        sigframe_size = sizeof(ARM64Arch::rt_sigframe) +
-                        sizeof(ARM64Arch::user_fpsimd_state);
-      } else {
-        DEBUG_ASSERT(0 && "Add sigframe size for your architecture here");
+      sigframe_size = get_sigframe_size(t->arch());
+
+      /*
+       * If we're delivering a signal while in the extended jump patch, pretend we're in the
+       * unpatched code instead. That way, any unwinder that makes use of CFI for unwinding
+       * will see the correct unwind info of the patch site rather than that of the extended
+       * jump patch. The instruction sequence in the original code was of course altered by
+       * the patch, so if the signal handler inspects that, it might get confused. However,
+       * that is already a general problem with our patching strategy, in that the application
+       * is not allowed to read its own code.
+       * Naturally, we need to perform the inverse transformation in sigreturn.
+       *
+       * N.B.: We do this by modifying the sigframe after signal delivery, rather
+       * than modifying the registers during signal delivery, because on some platforms
+       * (e.g. aarch64) the kernel will adjust the pre-signal registers after the signal stop.
+       */
+      remote_ptr<void> sigframe = t->regs().sp();
+      remote_code_ptr ip = get_sigframe_ip(t, sigframe);
+      Monkeypatcher::patched_syscall* ps = t->vm()->monkeypatcher().find_jump_stub(ip, true);
+      if (ps) {
+        uint64_t translated_patch_offset = (ip - ps->stub_addr.as_int()).register_value() - (ps->size - ps->safe_suffix);
+        // We patch out the jump stub with nops, but of course, if we happen to find ourselves
+        // in the middle of the nop sled, we just want to end up at the end of the patch
+        // region.
+        size_t total_patch_region_size = ps->hook->patch_region_length +
+                                         rr::syscall_instruction_length(t->arch());
+        if (translated_patch_offset > total_patch_region_size) {
+          translated_patch_offset = total_patch_region_size;
+        }
+        set_sigframe_ip(t, sigframe, ps->patch_addr.as_int() + translated_patch_offset);
+        LOG(debug) << "Moved ip from extended jump patch to patch area";
       }
 
       t->ev().transform(EV_SIGNAL_HANDLER);
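
Note: the clamping is easier to see with made-up numbers. Suppose the patch
region is 8 bytes of patched-out instructions plus a 4-byte syscall instruction
(aarch64 flavor), and the interrupted ip sits deep in the stub's trailing nop
sled:

    uint64_t patch_region_length = 8;  // patched-out instructions
    uint64_t syscall_insn_length = 4;  // e.g. an aarch64 SVC
    uint64_t total = patch_region_length + syscall_insn_length; // == 12
    uint64_t offset = 20; // raw translation landed in the nop sled
    if (offset > total) offset = total; // clamp: resume right after the patch region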
@@ -1909,32 +2010,22 @@ static bool is_ptrace_any_sysemu(SupportedArch arch, int command)
 bool RecordSession::process_syscall_entry(RecordTask* t, StepState* step_state,
                                           RecordResult* step_result,
                                           SupportedArch syscall_arch) {
-  if (const RecordTask::StashedSignal* sig = t->stashed_sig_not_synthetic_SIGCHLD()) {
-    // The only four cases where we allow a stashed signal to be pending on
-    // syscall entry are:
-    // -- the signal is a ptrace-related signal, in which case if it's generated
-    // during a blocking syscall, it does not interrupt the syscall
-    // -- rrcall_notify_syscall_hook_exit, which is effectively a noop and
-    // lets us dispatch signals afterward
-    // -- when we're entering a blocking untraced syscall. If it really blocks,
-    // we'll get the desched-signal notification and dispatch our stashed
-    // signal.
-    // -- when we're doing a privileged syscall that's internal to the preload
-    // logic
-    // We do not generally want to have stashed signals pending when we enter
-    // a syscall, because that will execute with a hacked signal mask
-    // (see RecordTask::will_resume_execution) which could make things go wrong.
-    ASSERT(t,
-           t->desched_rec() || is_rrcall_notify_syscall_hook_exit_syscall(
-                                   t->regs().original_syscallno(), t->arch()) ||
-               t->ip() ==
-                   t->vm()
-                       ->privileged_traced_syscall_ip()
-                       .increment_by_syscall_insn_length(t->arch()))
-        << "Stashed signal pending on syscall entry when it shouldn't be: "
-        << sig->siginfo << "; regs=" << t->regs()
-        << "; last_execution_resume=" << t->last_execution_resume()
-        << "; sig ip=" << sig->ip;
+  if (!t->is_in_syscallbuf() && t->stashed_sig_not_synthetic_SIGCHLD()) {
+    // If we have a pending signal, deliver it as if it had happened just before
+    // execution of the syscall instruction. To this end, kick us out of the
+    // current syscall again and set up the registers for a restart. Regular
+    // signal injection will do the rest.
+    LOG(debug) << "Entered syscall, but signal pending - setting up pre-syscall signal delivery";
+    Registers entry_regs = t->regs();
+    Registers r = entry_regs;
+    // Cause kernel processing to skip the syscall
+    r.set_original_syscallno(SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO);
+    t->set_regs(r);
+    t->exit_syscall();
+    entry_regs.set_ip(entry_regs.ip().decrement_by_syscall_insn_length(syscall_arch));
+    entry_regs.set_syscallno(entry_regs.original_syscallno());
+    t->set_regs(entry_regs);
+    return false;
   }
 
   // We just entered a syscall.
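
Note: the rewind above re-creates the pre-syscall register state so that
regular signal injection (and a subsequent restart) behaves as if the syscall
had never been entered. On x86-64, for instance, the syscall instruction is 2
bytes and the kernel leaves -ENOSYS in rax after a skipped syscall, so the
equivalent raw-ptrace fix-up would be (illustrative, not rr code):

    struct user_regs_struct regs;
    ptrace(PTRACE_GETREGS, tid, 0, &regs);
    regs.rip -= 2;            // back onto the 2-byte `syscall` instruction
    regs.rax = regs.orig_rax; // restore the syscall number clobbered by -ENOSYS
    ptrace(PTRACE_SETREGS, tid, 0, &regs);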