Skip to content

Commit cbd6930

Browse files
committed
Support preadv2 and pwritev2 syscalls.
Previously these were marked UnsupportedSyscall, so rr expected the kernel to return ENOSYS; on modern kernels (Linux 4.6+) the syscall succeeds and rr hits a FATAL assertion. Fixes #3193. Handle them like their v1 counterparts, but with an additional flag whitelist: reject unknown RWF_* flags with EINVAL so that a future kernel addition whose semantics break rr can't silently go wrong. RWF_APPEND is additionally rejected for pwritev2 because the kernel would ignore the user's offset while FileMonitor would still compute it from the arguments and misrecord the write.
1 parent 9e6dfa8 commit cbd6930

6 files changed

Lines changed: 119 additions & 10 deletions

File tree

src/FileMonitor.cc

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ static bool is_implicit_offset_syscall_arch(int syscallno) {
2525
template <typename Arch>
2626
static bool is_write_syscall_arch(int syscallno) {
2727
return syscallno == Arch::writev || syscallno == Arch::write ||
28-
syscallno == Arch::pwrite64 || syscallno == Arch::pwritev;
28+
syscallno == Arch::pwrite64 || syscallno == Arch::pwritev ||
29+
syscallno == Arch::pwritev2;
2930
}
3031

3132
static bool is_implicit_offset_syscall(SupportedArch arch, int syscallno) {
@@ -38,8 +39,10 @@ static int64_t retrieve_offset_arch(Task* t, int syscallno,
3839
switch (syscallno) {
3940
case Arch::pwrite64:
4041
case Arch::pwritev:
42+
case Arch::pwritev2:
4143
case Arch::pread64:
42-
case Arch::preadv: {
44+
case Arch::preadv:
45+
case Arch::preadv2: {
4346
if (sizeof(typename Arch::unsigned_word) == 4) {
4447
return regs.arg4() | (uint64_t(regs.arg5_signed()) << 32);
4548
}

src/Task.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -761,6 +761,7 @@ void Task::on_syscall_exit_arch(int syscallno, const Registers& regs) {
761761
}
762762

763763
case Arch::pwritev:
764+
case Arch::pwritev2:
764765
case Arch::writev: {
765766
int fd = (int)regs.orig_arg1_signed();
766767
vector<FileMonitor::Range> ranges;

src/record_syscall.cc

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3797,7 +3797,8 @@ static Switchable did_emulate_read(int syscallno, RecordTask* t,
37973797
{
37983798
syscall_state.emulate_result(result);
37993799
record_ranges(t, ranges, result);
3800-
if (syscallno == Arch::pread64 || syscallno == Arch::preadv || result <= 0) {
3800+
if (syscallno == Arch::pread64 || syscallno == Arch::preadv ||
3801+
syscallno == Arch::preadv2 || result <= 0) {
38013802
// Don't perform this syscall.
38023803
Registers r = t->regs();
38033804
r.set_arg1(-1);
@@ -3823,6 +3824,42 @@ static ParamSize select_param_size(intptr_t nfds, SupportedArch arch) {
38233824
return ParamSize(size);
38243825
}
38253826

3827+
// RWF_* flags we know rr records correctly. Reject unknown flags so
3828+
// that a future kernel addition whose semantics break rr (e.g. affecting
3829+
// the offset or bytes we record) cannot silently go wrong; the tracee
3830+
// simply sees EINVAL, much like an unsupported flag on an older kernel
// (NOTE(review): preadv2(2) documents EOPNOTSUPP for that case; confirm).
3831+
// RWF_APPEND is excluded from the write mask: when set, the kernel
3832+
// ignores the user's offset and appends the data at the end of the file,
3833+
// but FileMonitor::retrieve_offset would still compute the offset from the
3834+
// explicit argument and get it wrong.
3835+
enum {
3836+
RR_RWF_HIPRI = 0x00000001,
3837+
RR_RWF_DSYNC = 0x00000002,
3838+
RR_RWF_SYNC = 0x00000004,
3839+
RR_RWF_NOWAIT = 0x00000008,
3840+
RR_RWF_APPEND = 0x00000010,
3841+
RR_RWF_NOAPPEND = 0x00000020,
3842+
RR_RWF_ATOMIC = 0x00000040,
3843+
RR_RWF_DONTCACHE = 0x00000080,
3844+
};
3845+
static const uint32_t RR_KNOWN_PREADV2_FLAGS =
3846+
RR_RWF_HIPRI | RR_RWF_DSYNC | RR_RWF_SYNC | RR_RWF_NOWAIT |
3847+
RR_RWF_APPEND | RR_RWF_NOAPPEND | RR_RWF_ATOMIC | RR_RWF_DONTCACHE;
3848+
static const uint32_t RR_KNOWN_PWRITEV2_FLAGS =
3849+
RR_KNOWN_PREADV2_FLAGS & ~RR_RWF_APPEND;
3850+
3851+
template <typename Arch>
3852+
static Switchable reject_preadv2_pwritev2(RecordTask* t,
3853+
TaskSyscallState& syscall_state) {
3854+
syscall_state.emulate_result(-EINVAL);
3855+
// Point fd at -1 so the kernel short-circuits the syscall with -EBADF;
3856+
// emulate_result overrides the tracee-visible result to -EINVAL.
3857+
Registers r = t->regs();
3858+
r.set_arg1(-1);
3859+
t->set_regs(r);
3860+
return PREVENT_SWITCH;
3861+
}
3862+
38263863
template <typename Arch>
38273864
static Switchable rec_prepare_syscall_arch(RecordTask* t,
38283865
TaskSyscallState& syscall_state,
@@ -4556,7 +4593,14 @@ static Switchable rec_prepare_syscall_arch(RecordTask* t,
45564593
case Arch::readv:
45574594
/* ssize_t preadv(int fd, const struct iovec *iov, int iovcnt,
45584595
off_t offset); */
4559-
case Arch::preadv: {
4596+
case Arch::preadv:
4597+
/* ssize_t preadv2(int fd, const struct iovec *iov, int iovcnt,
4598+
off_t offset, int flags); */
4599+
case Arch::preadv2: {
4600+
if (syscallno == Arch::preadv2 &&
4601+
((uint32_t)regs.arg6() & ~RR_KNOWN_PREADV2_FLAGS)) {
4602+
return reject_preadv2_pwritev2<Arch>(t, syscall_state);
4603+
}
45604604
int fd = (int)regs.arg1_signed();
45614605
int iovcnt = (int)regs.arg3_signed();
45624606
remote_ptr<void> iovecsp_void = syscall_state.reg_parameter(
@@ -4583,6 +4627,15 @@ static Switchable rec_prepare_syscall_arch(RecordTask* t,
45834627
return ALLOW_SWITCH;
45844628
}
45854629

4630+
/* ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt,
4631+
off_t offset, int flags); */
4632+
case Arch::pwritev2: {
4633+
if ((uint32_t)regs.arg6() & ~RR_KNOWN_PWRITEV2_FLAGS) {
4634+
return reject_preadv2_pwritev2<Arch>(t, syscall_state);
4635+
}
4636+
return ALLOW_SWITCH;
4637+
}
4638+
45864639
/* pid_t waitpid(pid_t pid, int *status, int options); */
45874640
/* pid_t wait4(pid_t pid, int *status, int options, struct rusage
45884641
* *rusage);
@@ -7282,6 +7335,8 @@ static void rec_process_syscall_arch(RecordTask* t,
72827335
case Arch::pkey_mprotect:
72837336
case Arch::pread64:
72847337
case Arch::preadv:
7338+
case Arch::preadv2:
7339+
case Arch::pwritev2:
72857340
case Arch::ptrace:
72867341
case Arch::read:
72877342
case Arch::readv:

src/syscalls.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1687,8 +1687,8 @@ def __init__(self, **kwargs):
16871687
membarrier = EmulatedSyscall(x86=375, x64=324, generic=283)
16881688
mlock2 = UnsupportedSyscall(x86=376, x64=325, generic=284)
16891689
copy_file_range = IrregularEmulatedSyscall(x86=377, x64=326, generic=285)
1690-
preadv2 = UnsupportedSyscall(x86=378, x64=327, generic=286)
1691-
pwritev2 = UnsupportedSyscall(x86=379, x64=328, generic=287)
1690+
preadv2 = IrregularEmulatedSyscall(x86=378, x64=327, generic=286)
1691+
pwritev2 = IrregularEmulatedSyscall(x86=379, x64=328, generic=287)
16921692
pkey_mprotect = IrregularEmulatedSyscall(x86=380, x64=329, generic=288)
16931693
pkey_alloc = EmulatedSyscall(x86=381, x64=330, generic=289)
16941694
pkey_free = EmulatedSyscall(x86=382, x64=331, generic=290)

src/test/readv.c

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */
22

33
#include "util.h"
4+
#include "util_internal.h"
45

56
static char data[11] = "0123456789";
67

7-
static void test(int use_preadv) {
8+
static void test(int mode) {
89
static const char name[] = "temp";
910
int fd = open(name, O_CREAT | O_RDWR | O_EXCL, 0600);
1011
struct {
@@ -26,9 +27,11 @@ static void test(int use_preadv) {
2627
iovs[0].iov_len = sizeof(*part1);
2728
iovs[1].iov_base = part2;
2829
iovs[1].iov_len = sizeof(*part2);
29-
if (use_preadv) {
30+
if (mode == 1) {
3031
/* Work around busted preadv prototype in older libcs */
3132
nread = syscall(SYS_preadv, fd, iovs, 2, (off_t)0, 0);
33+
} else if (mode == 2) {
34+
nread = syscall(SYS_preadv2, fd, iovs, 2, (off_t)0, 0, 0);
3235
} else {
3336
test_assert(0 == lseek(fd, 0, SEEK_SET));
3437
nread = readv(fd, iovs, 2);
@@ -43,8 +46,29 @@ static void test(int use_preadv) {
4346
}
4447

4548
int main(void) {
49+
int fd;
50+
struct iovec iov;
51+
char buf;
52+
ssize_t ret;
53+
4654
test(0);
4755
test(1);
56+
test(2);
57+
58+
/* Unknown RWF_ flags must be rejected with EINVAL so that future
59+
* kernel additions can't silently break rr's recording. Only rr
60+
* is guaranteed to produce EINVAL here; the bare kernel may accept
61+
* such flags or return a different error. */
62+
if (running_under_rr()) {
63+
fd = open("temp2", O_CREAT | O_RDWR | O_EXCL, 0600);
64+
test_assert(fd >= 0);
65+
test_assert(0 == unlink("temp2"));
66+
iov.iov_base = &buf;
67+
iov.iov_len = 1;
68+
ret = syscall(SYS_preadv2, fd, &iov, 1, (off_t)0, 0, 0x40000000);
69+
test_assert(ret == -1 && errno == EINVAL);
70+
close(fd);
71+
}
4872

4973
atomic_puts("EXIT-SUCCESS");
5074
return 0;

src/test/writev.c

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */
22

33
#include "util.h"
4+
#include "util_internal.h"
45

56
static char data[11] = "0123456789";
67

7-
static void test(int use_pwritev) {
8+
static void test(int mode) {
89
static const char name[] = "temp";
910
int fd = open(name, O_CREAT | O_EXCL | O_RDWR, 0600);
1011
struct {
@@ -20,9 +21,11 @@ static void test(int use_pwritev) {
2021
iovs[0].iov_len = 7;
2122
iovs[1].iov_base = data + iovs[0].iov_len;
2223
iovs[1].iov_len = sizeof(data) - iovs[0].iov_len;
23-
if (use_pwritev) {
24+
if (mode == 1) {
2425
/* Work around busted pwritev prototype in older libcs */
2526
nwritten = syscall(SYS_pwritev, fd, iovs, 2, (off_t)0, 0);
27+
} else if (mode == 2) {
28+
nwritten = syscall(SYS_pwritev2, fd, iovs, 2, (off_t)0, 0, 0);
2629
} else {
2730
nwritten = writev(fd, iovs, 2);
2831
}
@@ -36,8 +39,31 @@ static void test(int use_pwritev) {
3639
}
3740

3841
int main(void) {
42+
int fd;
43+
struct iovec iov;
44+
char buf = 'a';
45+
ssize_t ret;
46+
3947
test(0);
4048
test(1);
49+
test(2);
50+
51+
/* rr rejects RWF_APPEND (0x10): the kernel would ignore the explicit
52+
* offset, but FileMonitor would compute it from the arguments and
53+
* get it wrong. Unknown flags are rejected for the same kind of
54+
* reason. Only rr is guaranteed to produce EINVAL here. */
55+
if (running_under_rr()) {
56+
fd = open("temp2", O_CREAT | O_RDWR | O_EXCL, 0600);
57+
test_assert(fd >= 0);
58+
test_assert(0 == unlink("temp2"));
59+
iov.iov_base = &buf;
60+
iov.iov_len = 1;
61+
ret = syscall(SYS_pwritev2, fd, &iov, 1, (off_t)0, 0, 0x10 /*RWF_APPEND*/);
62+
test_assert(ret == -1 && errno == EINVAL);
63+
ret = syscall(SYS_pwritev2, fd, &iov, 1, (off_t)0, 0, 0x40000000);
64+
test_assert(ret == -1 && errno == EINVAL);
65+
close(fd);
66+
}
4167

4268
atomic_puts("EXIT-SUCCESS");
4369
return 0;

0 commit comments

Comments
 (0)