Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions src/FileMonitor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ static bool is_implicit_offset_syscall_arch(int syscallno) {
template <typename Arch>
static bool is_write_syscall_arch(int syscallno) {
return syscallno == Arch::writev || syscallno == Arch::write ||
syscallno == Arch::pwrite64 || syscallno == Arch::pwritev;
syscallno == Arch::pwrite64 || syscallno == Arch::pwritev ||
syscallno == Arch::pwritev2;
}

static bool is_implicit_offset_syscall(SupportedArch arch, int syscallno) {
Expand All @@ -38,8 +39,10 @@ static int64_t retrieve_offset_arch(Task* t, int syscallno,
switch (syscallno) {
case Arch::pwrite64:
case Arch::pwritev:
case Arch::pwritev2:
case Arch::pread64:
case Arch::preadv: {
case Arch::preadv:
case Arch::preadv2: {
if (sizeof(typename Arch::unsigned_word) == 4) {
return regs.arg4() | (uint64_t(regs.arg5_signed()) << 32);
}
Expand Down
1 change: 1 addition & 0 deletions src/Task.cc
Original file line number Diff line number Diff line change
Expand Up @@ -761,6 +761,7 @@ void Task::on_syscall_exit_arch(int syscallno, const Registers& regs) {
}

case Arch::pwritev:
case Arch::pwritev2:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the RWF_APPEND flag is set then the kernel ignores the user's offset, but we'll compute an incorrect offset below and things will go wrong. (I think we already have an existing bug if the user passes O_APPEND to open(); in that case pwritev will ignore the offset and we compute the wrong offset.)

So let's do what I said above and specify a mask of flags we know we can handle correctly --- I think that's everything currently defined except for RWF_APPEND --- and return EINVAL if any other flags are set. That will require adding code in record_syscall.cc for pwritev2.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for taking a look! Fixed

case Arch::writev: {
int fd = (int)regs.orig_arg1_signed();
vector<FileMonitor::Range> ranges;
Expand Down
59 changes: 57 additions & 2 deletions src/record_syscall.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3797,7 +3797,8 @@ static Switchable did_emulate_read(int syscallno, RecordTask* t,
{
syscall_state.emulate_result(result);
record_ranges(t, ranges, result);
if (syscallno == Arch::pread64 || syscallno == Arch::preadv || result <= 0) {
if (syscallno == Arch::pread64 || syscallno == Arch::preadv ||
syscallno == Arch::preadv2 || result <= 0) {
// Don't perform this syscall.
Registers r = t->regs();
r.set_arg1(-1);
Expand All @@ -3823,6 +3824,42 @@ static ParamSize select_param_size(intptr_t nfds, SupportedArch arch) {
return ParamSize(size);
}

// RWF_* flags we know rr records correctly. Reject unknown flags so
// that a future kernel addition whose semantics break rr (e.g. affecting
// the offset or bytes we record) cannot silently go wrong; the tracee
// simply sees EINVAL as if running on an older kernel.
// RWF_APPEND is excluded from the write mask: when set, the kernel
// ignores the user's offset and uses/updates the current file position,
// but FileMonitor::retrieve_offset would compute the explicit offset
// argument and get it wrong.
enum {
RR_RWF_HIPRI = 0x00000001,
RR_RWF_DSYNC = 0x00000002,
RR_RWF_SYNC = 0x00000004,
RR_RWF_NOWAIT = 0x00000008,
RR_RWF_APPEND = 0x00000010,
RR_RWF_NOAPPEND = 0x00000020,
RR_RWF_ATOMIC = 0x00000040,
RR_RWF_DONTCACHE = 0x00000080,
};
static const uint32_t RR_KNOWN_PREADV2_FLAGS =
RR_RWF_HIPRI | RR_RWF_DSYNC | RR_RWF_SYNC | RR_RWF_NOWAIT |
RR_RWF_APPEND | RR_RWF_NOAPPEND | RR_RWF_ATOMIC | RR_RWF_DONTCACHE;
static const uint32_t RR_KNOWN_PWRITEV2_FLAGS =
RR_KNOWN_PREADV2_FLAGS & ~RR_RWF_APPEND;

template <typename Arch>
static Switchable reject_preadv2_pwritev2(RecordTask* t,
TaskSyscallState& syscall_state) {
syscall_state.emulate_result(-EINVAL);
// Point fd at -1 so the kernel short-circuits the syscall with -EBADF;
// emulate_result overrides the tracee-visible result to -EINVAL.
Registers r = t->regs();
r.set_arg1(-1);
t->set_regs(r);
return PREVENT_SWITCH;
}

template <typename Arch>
static Switchable rec_prepare_syscall_arch(RecordTask* t,
TaskSyscallState& syscall_state,
Expand Down Expand Up @@ -4556,7 +4593,14 @@ static Switchable rec_prepare_syscall_arch(RecordTask* t,
case Arch::readv:
/* ssize_t preadv(int fd, const struct iovec *iov, int iovcnt,
off_t offset); */
case Arch::preadv: {
case Arch::preadv:
/* ssize_t preadv2(int fd, const struct iovec *iov, int iovcnt,
off_t offset, int flags); */
case Arch::preadv2: {
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's possible that some future RWF_ flag will introduce behaviour that breaks rr. So it would be a good idea to define a set of flags that we know are ok with rr with preadv2 (all the flags currently defined, I think), and fail here with EINVAL if some flag outside that set is passed in. See below for why this is especially important for pwritev2.

if (syscallno == Arch::preadv2 &&
((uint32_t)regs.arg6() & ~RR_KNOWN_PREADV2_FLAGS)) {
return reject_preadv2_pwritev2<Arch>(t, syscall_state);
}
int fd = (int)regs.arg1_signed();
int iovcnt = (int)regs.arg3_signed();
remote_ptr<void> iovecsp_void = syscall_state.reg_parameter(
Expand All @@ -4583,6 +4627,15 @@ static Switchable rec_prepare_syscall_arch(RecordTask* t,
return ALLOW_SWITCH;
}

/* ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt,
off_t offset, int flags); */
case Arch::pwritev2: {
if ((uint32_t)regs.arg6() & ~RR_KNOWN_PWRITEV2_FLAGS) {
return reject_preadv2_pwritev2<Arch>(t, syscall_state);
}
return ALLOW_SWITCH;
}

/* pid_t waitpid(pid_t pid, int *status, int options); */
/* pid_t wait4(pid_t pid, int *status, int options, struct rusage
* *rusage);
Expand Down Expand Up @@ -7282,6 +7335,8 @@ static void rec_process_syscall_arch(RecordTask* t,
case Arch::pkey_mprotect:
case Arch::pread64:
case Arch::preadv:
case Arch::preadv2:
case Arch::pwritev2:
case Arch::ptrace:
case Arch::read:
case Arch::readv:
Expand Down
4 changes: 2 additions & 2 deletions src/syscalls.py
Original file line number Diff line number Diff line change
Expand Up @@ -1687,8 +1687,8 @@ def __init__(self, **kwargs):
membarrier = EmulatedSyscall(x86=375, x64=324, generic=283)
mlock2 = UnsupportedSyscall(x86=376, x64=325, generic=284)
copy_file_range = IrregularEmulatedSyscall(x86=377, x64=326, generic=285)
preadv2 = UnsupportedSyscall(x86=378, x64=327, generic=286)
pwritev2 = UnsupportedSyscall(x86=379, x64=328, generic=287)
preadv2 = IrregularEmulatedSyscall(x86=378, x64=327, generic=286)
pwritev2 = IrregularEmulatedSyscall(x86=379, x64=328, generic=287)
pkey_mprotect = IrregularEmulatedSyscall(x86=380, x64=329, generic=288)
pkey_alloc = EmulatedSyscall(x86=381, x64=330, generic=289)
pkey_free = EmulatedSyscall(x86=382, x64=331, generic=290)
Expand Down
28 changes: 26 additions & 2 deletions src/test/readv.c
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */

#include "util.h"
#include "util_internal.h"

static char data[11] = "0123456789";

static void test(int use_preadv) {
static void test(int mode) {
static const char name[] = "temp";
int fd = open(name, O_CREAT | O_RDWR | O_EXCL, 0600);
struct {
Expand All @@ -26,9 +27,11 @@ static void test(int use_preadv) {
iovs[0].iov_len = sizeof(*part1);
iovs[1].iov_base = part2;
iovs[1].iov_len = sizeof(*part2);
if (use_preadv) {
if (mode == 1) {
/* Work around busted preadv prototype in older libcs */
nread = syscall(SYS_preadv, fd, iovs, 2, (off_t)0, 0);
} else if (mode == 2) {
nread = syscall(SYS_preadv2, fd, iovs, 2, (off_t)0, 0, 0);
} else {
test_assert(0 == lseek(fd, 0, SEEK_SET));
nread = readv(fd, iovs, 2);
Expand All @@ -43,8 +46,29 @@ static void test(int use_preadv) {
}

int main(void) {
int fd;
struct iovec iov;
char buf;
ssize_t ret;

test(0);
test(1);
test(2);

/* Unknown RWF_ flags must be rejected with EINVAL so that future
* kernel additions can't silently break rr's recording. Only rr
* is guaranteed to produce EINVAL here; the bare kernel may accept
* such flags or return a different error. */
if (running_under_rr()) {
fd = open("temp2", O_CREAT | O_RDWR | O_EXCL, 0600);
test_assert(fd >= 0);
test_assert(0 == unlink("temp2"));
iov.iov_base = &buf;
iov.iov_len = 1;
ret = syscall(SYS_preadv2, fd, &iov, 1, (off_t)0, 0, 0x40000000);
test_assert(ret == -1 && errno == EINVAL);
close(fd);
}

atomic_puts("EXIT-SUCCESS");
return 0;
Expand Down
30 changes: 28 additions & 2 deletions src/test/writev.c
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */

#include "util.h"
#include "util_internal.h"

static char data[11] = "0123456789";

static void test(int use_pwritev) {
static void test(int mode) {
static const char name[] = "temp";
int fd = open(name, O_CREAT | O_EXCL | O_RDWR, 0600);
struct {
Expand All @@ -20,9 +21,11 @@ static void test(int use_pwritev) {
iovs[0].iov_len = 7;
iovs[1].iov_base = data + iovs[0].iov_len;
iovs[1].iov_len = sizeof(data) - iovs[0].iov_len;
if (use_pwritev) {
if (mode == 1) {
/* Work around busted pwritev prototype in older libcs */
nwritten = syscall(SYS_pwritev, fd, iovs, 2, (off_t)0, 0);
} else if (mode == 2) {
nwritten = syscall(SYS_pwritev2, fd, iovs, 2, (off_t)0, 0, 0);
} else {
nwritten = writev(fd, iovs, 2);
}
Expand All @@ -36,8 +39,31 @@ static void test(int use_pwritev) {
}

int main(void) {
int fd;
struct iovec iov;
char buf = 'a';
ssize_t ret;

test(0);
test(1);
test(2);

/* rr rejects RWF_APPEND (0x10): the kernel would ignore the explicit
* offset, but FileMonitor would compute it from the arguments and
* get it wrong. Unknown flags are rejected for the same kind of
* reason. Only rr is guaranteed to produce EINVAL here. */
if (running_under_rr()) {
fd = open("temp2", O_CREAT | O_RDWR | O_EXCL, 0600);
test_assert(fd >= 0);
test_assert(0 == unlink("temp2"));
iov.iov_base = &buf;
iov.iov_len = 1;
ret = syscall(SYS_pwritev2, fd, &iov, 1, (off_t)0, 0, 0x10 /*RWF_APPEND*/);
test_assert(ret == -1 && errno == EINVAL);
ret = syscall(SYS_pwritev2, fd, &iov, 1, (off_t)0, 0, 0x40000000);
test_assert(ret == -1 && errno == EINVAL);
close(fd);
}

atomic_puts("EXIT-SUCCESS");
return 0;
Expand Down
Loading