From cbd6930548bfbb5319b2c04e2b50ca31701c90f8 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 13 Apr 2026 10:32:02 +0200 Subject: [PATCH] Support preadv2 and pwritev2 syscalls. Previously these were marked UnsupportedSyscall, so rr expected the kernel to return ENOSYS; on modern kernels (Linux 4.6+) the syscall succeeds and rr hits a FATAL assertion. Fixes #3193. Handle them like their v1 counterparts, but with an additional flag whitelist: reject unknown RWF_* flags with EINVAL so that a future kernel addition whose semantics break rr can't silently go wrong. RWF_APPEND is additionally rejected for pwritev2 because the kernel would ignore the user's offset while FileMonitor would still compute it from the arguments and misrecord the write. --- src/FileMonitor.cc | 7 +++-- src/Task.cc | 1 + src/record_syscall.cc | 59 +++++++++++++++++++++++++++++++++++++++++-- src/syscalls.py | 4 +-- src/test/readv.c | 28 ++++++++++++++++++-- src/test/writev.c | 30 ++++++++++++++++++++-- 6 files changed, 119 insertions(+), 10 deletions(-) diff --git a/src/FileMonitor.cc b/src/FileMonitor.cc index 340d55b1a4c..b91658d1123 100644 --- a/src/FileMonitor.cc +++ b/src/FileMonitor.cc @@ -25,7 +25,8 @@ static bool is_implicit_offset_syscall_arch(int syscallno) { template static bool is_write_syscall_arch(int syscallno) { return syscallno == Arch::writev || syscallno == Arch::write || - syscallno == Arch::pwrite64 || syscallno == Arch::pwritev; + syscallno == Arch::pwrite64 || syscallno == Arch::pwritev || + syscallno == Arch::pwritev2; } static bool is_implicit_offset_syscall(SupportedArch arch, int syscallno) { @@ -38,8 +39,10 @@ static int64_t retrieve_offset_arch(Task* t, int syscallno, switch (syscallno) { case Arch::pwrite64: case Arch::pwritev: + case Arch::pwritev2: case Arch::pread64: - case Arch::preadv: { + case Arch::preadv: + case Arch::preadv2: { if (sizeof(typename Arch::unsigned_word) == 4) { return regs.arg4() | (uint64_t(regs.arg5_signed()) << 32); } diff --git 
a/src/Task.cc b/src/Task.cc index 7d236f66013..f8c8541230f 100644 --- a/src/Task.cc +++ b/src/Task.cc @@ -761,6 +761,7 @@ void Task::on_syscall_exit_arch(int syscallno, const Registers& regs) { } case Arch::pwritev: + case Arch::pwritev2: case Arch::writev: { int fd = (int)regs.orig_arg1_signed(); vector ranges; diff --git a/src/record_syscall.cc b/src/record_syscall.cc index 6eccada9e98..5d911233b23 100644 --- a/src/record_syscall.cc +++ b/src/record_syscall.cc @@ -3797,7 +3797,8 @@ static Switchable did_emulate_read(int syscallno, RecordTask* t, { syscall_state.emulate_result(result); record_ranges(t, ranges, result); - if (syscallno == Arch::pread64 || syscallno == Arch::preadv || result <= 0) { + if (syscallno == Arch::pread64 || syscallno == Arch::preadv || + syscallno == Arch::preadv2 || result <= 0) { // Don't perform this syscall. Registers r = t->regs(); r.set_arg1(-1); @@ -3823,6 +3824,42 @@ static ParamSize select_param_size(intptr_t nfds, SupportedArch arch) { return ParamSize(size); } +// RWF_* flags we know rr records correctly. Reject unknown flags so +// that a future kernel addition whose semantics break rr (e.g. affecting +// the offset or bytes we record) cannot silently go wrong; the tracee +// simply sees EINVAL (a real kernel reports unknown flags as EOPNOTSUPP). +// RWF_APPEND is excluded from the write mask: when set, the kernel +// ignores the user's offset and uses/updates the current file position, +// but FileMonitor::retrieve_offset would compute the explicit offset +// argument and get it wrong. 
+enum { + RR_RWF_HIPRI = 0x00000001, + RR_RWF_DSYNC = 0x00000002, + RR_RWF_SYNC = 0x00000004, + RR_RWF_NOWAIT = 0x00000008, + RR_RWF_APPEND = 0x00000010, + RR_RWF_NOAPPEND = 0x00000020, + RR_RWF_ATOMIC = 0x00000040, + RR_RWF_DONTCACHE = 0x00000080, +}; +static const uint32_t RR_KNOWN_PREADV2_FLAGS = + RR_RWF_HIPRI | RR_RWF_DSYNC | RR_RWF_SYNC | RR_RWF_NOWAIT | + RR_RWF_APPEND | RR_RWF_NOAPPEND | RR_RWF_ATOMIC | RR_RWF_DONTCACHE; +static const uint32_t RR_KNOWN_PWRITEV2_FLAGS = + RR_KNOWN_PREADV2_FLAGS & ~RR_RWF_APPEND; + +template +static Switchable reject_preadv2_pwritev2(RecordTask* t, + TaskSyscallState& syscall_state) { + syscall_state.emulate_result(-EINVAL); + // Point fd at -1 so the kernel short-circuits the syscall with -EBADF; + // emulate_result overrides the tracee-visible result to -EINVAL. + Registers r = t->regs(); + r.set_arg1(-1); + t->set_regs(r); + return PREVENT_SWITCH; +} + template static Switchable rec_prepare_syscall_arch(RecordTask* t, TaskSyscallState& syscall_state, @@ -4556,7 +4593,14 @@ static Switchable rec_prepare_syscall_arch(RecordTask* t, case Arch::readv: /* ssize_t preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset); */ - case Arch::preadv: { + case Arch::preadv: + /* ssize_t preadv2(int fd, const struct iovec *iov, int iovcnt, + off_t offset, int flags); */ + case Arch::preadv2: { + if (syscallno == Arch::preadv2 && + ((uint32_t)regs.arg6() & ~RR_KNOWN_PREADV2_FLAGS)) { + return reject_preadv2_pwritev2(t, syscall_state); + } int fd = (int)regs.arg1_signed(); int iovcnt = (int)regs.arg3_signed(); remote_ptr iovecsp_void = syscall_state.reg_parameter( @@ -4583,6 +4627,15 @@ static Switchable rec_prepare_syscall_arch(RecordTask* t, return ALLOW_SWITCH; } + /* ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt, + off_t offset, int flags); */ + case Arch::pwritev2: { + if ((uint32_t)regs.arg6() & ~RR_KNOWN_PWRITEV2_FLAGS) { + return reject_preadv2_pwritev2(t, syscall_state); + } + return ALLOW_SWITCH; + } + /* 
pid_t waitpid(pid_t pid, int *status, int options); */ /* pid_t wait4(pid_t pid, int *status, int options, struct rusage * *rusage); @@ -7282,6 +7335,8 @@ static void rec_process_syscall_arch(RecordTask* t, case Arch::pkey_mprotect: case Arch::pread64: case Arch::preadv: + case Arch::preadv2: + case Arch::pwritev2: case Arch::ptrace: case Arch::read: case Arch::readv: diff --git a/src/syscalls.py b/src/syscalls.py index 32b89f4d435..df54642d931 100644 --- a/src/syscalls.py +++ b/src/syscalls.py @@ -1687,8 +1687,8 @@ def __init__(self, **kwargs): membarrier = EmulatedSyscall(x86=375, x64=324, generic=283) mlock2 = UnsupportedSyscall(x86=376, x64=325, generic=284) copy_file_range = IrregularEmulatedSyscall(x86=377, x64=326, generic=285) -preadv2 = UnsupportedSyscall(x86=378, x64=327, generic=286) -pwritev2 = UnsupportedSyscall(x86=379, x64=328, generic=287) +preadv2 = IrregularEmulatedSyscall(x86=378, x64=327, generic=286) +pwritev2 = IrregularEmulatedSyscall(x86=379, x64=328, generic=287) pkey_mprotect = IrregularEmulatedSyscall(x86=380, x64=329, generic=288) pkey_alloc = EmulatedSyscall(x86=381, x64=330, generic=289) pkey_free = EmulatedSyscall(x86=382, x64=331, generic=290) diff --git a/src/test/readv.c b/src/test/readv.c index cd90d5ea118..13285e347ed 100644 --- a/src/test/readv.c +++ b/src/test/readv.c @@ -1,10 +1,11 @@ /* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "util.h" +#include "util_internal.h" static char data[11] = "0123456789"; -static void test(int use_preadv) { +static void test(int mode) { static const char name[] = "temp"; int fd = open(name, O_CREAT | O_RDWR | O_EXCL, 0600); struct { @@ -26,9 +27,11 @@ static void test(int use_preadv) { iovs[0].iov_len = sizeof(*part1); iovs[1].iov_base = part2; iovs[1].iov_len = sizeof(*part2); - if (use_preadv) { + if (mode == 1) { /* Work around busted preadv prototype in older libcs */ nread = syscall(SYS_preadv, fd, iovs, 2, (off_t)0, 0); + } else if (mode == 2) { + 
nread = syscall(SYS_preadv2, fd, iovs, 2, (off_t)0, 0, 0); } else { test_assert(0 == lseek(fd, 0, SEEK_SET)); nread = readv(fd, iovs, 2); @@ -43,8 +46,29 @@ static void test(int use_preadv) { } int main(void) { + int fd; + struct iovec iov; + char buf; + ssize_t ret; + test(0); test(1); + test(2); + + /* Unknown RWF_ flags must be rejected with EINVAL so that future + * kernel additions can't silently break rr's recording. Only rr + * is guaranteed to produce EINVAL here; the bare kernel may accept + * such flags or return a different error. */ + if (running_under_rr()) { + fd = open("temp2", O_CREAT | O_RDWR | O_EXCL, 0600); + test_assert(fd >= 0); + test_assert(0 == unlink("temp2")); + iov.iov_base = &buf; + iov.iov_len = 1; + ret = syscall(SYS_preadv2, fd, &iov, 1, (off_t)0, 0, 0x40000000); + test_assert(ret == -1 && errno == EINVAL); + close(fd); + } atomic_puts("EXIT-SUCCESS"); return 0; diff --git a/src/test/writev.c b/src/test/writev.c index c75f45dda6b..c89680676fb 100644 --- a/src/test/writev.c +++ b/src/test/writev.c @@ -1,10 +1,11 @@ /* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "util.h" +#include "util_internal.h" static char data[11] = "0123456789"; -static void test(int use_pwritev) { +static void test(int mode) { static const char name[] = "temp"; int fd = open(name, O_CREAT | O_EXCL | O_RDWR, 0600); struct { @@ -20,9 +21,11 @@ static void test(int use_pwritev) { iovs[0].iov_len = 7; iovs[1].iov_base = data + iovs[0].iov_len; iovs[1].iov_len = sizeof(data) - iovs[0].iov_len; - if (use_pwritev) { + if (mode == 1) { /* Work around busted pwritev prototype in older libcs */ nwritten = syscall(SYS_pwritev, fd, iovs, 2, (off_t)0, 0); + } else if (mode == 2) { + nwritten = syscall(SYS_pwritev2, fd, iovs, 2, (off_t)0, 0, 0); } else { nwritten = writev(fd, iovs, 2); } @@ -36,8 +39,31 @@ static void test(int use_pwritev) { } int main(void) { + int fd; + struct iovec iov; + char buf = 'a'; + ssize_t ret; + test(0); 
test(1); + test(2); + + /* rr rejects RWF_APPEND (0x10): the kernel would ignore the explicit + * offset, but FileMonitor would compute it from the arguments and + * get it wrong. Unknown flags are rejected for the same kind of + * reason. Only rr is guaranteed to produce EINVAL here. */ + if (running_under_rr()) { + fd = open("temp2", O_CREAT | O_RDWR | O_EXCL, 0600); + test_assert(fd >= 0); + test_assert(0 == unlink("temp2")); + iov.iov_base = &buf; + iov.iov_len = 1; + ret = syscall(SYS_pwritev2, fd, &iov, 1, (off_t)0, 0, 0x10 /*RWF_APPEND*/); + test_assert(ret == -1 && errno == EINVAL); + ret = syscall(SYS_pwritev2, fd, &iov, 1, (off_t)0, 0, 0x40000000); + test_assert(ret == -1 && errno == EINVAL); + close(fd); + } atomic_puts("EXIT-SUCCESS"); return 0;