Skip to content

Commit a2ac644

Browse files
committed
dmz: use overlayfs to write-protect /proc/self/exe if possible
Commit b999376 ("nsenter: cloned_binary: remove bindfd logic entirely") removed the read-only bind-mount logic from our cloned binary code because it wasn't really safe: a container with CAP_SYS_ADMIN could remove the MS_RDONLY bit and get write access to /proc/self/exe (even with user namespaces this could've been an issue because it's not clear if the flags are locked). However, copying a binary does seem to have a minor performance impact. The only way to have no performance impact would be for the kernel to block these write attempts, but barring that we could try to reduce the overhead by coming up with a mount that cannot have its read-only bits cleared. The "simplest" solution is to create a temporary overlayfs using fsopen(2) which uses the directory where runc exists as a lowerdir, ensuring that the container cannot access the underlying file -- and we don't have to do any copies. While fsopen(2) is not free because mount namespace cloning is usually expensive (and so it seems like the difference would be marginal), some basic performance testing seems to indicate there is a ~60% improvement doing it this way and that it has effectively no overhead even when compared to just using /proc/self/exe directly: % hyperfine --warmup 50 \ > "./runc-noclone run -b bundle ctr" \ > "./runc-overlayfs run -b bundle ctr" \ > "./runc-memfd run -b bundle ctr" Benchmark 1: ./runc-noclone run -b bundle ctr Time (mean ± σ): 13.7 ms ± 0.9 ms [User: 6.0 ms, System: 10.9 ms] Range (min … max): 11.3 ms … 16.1 ms 184 runs Benchmark 2: ./runc-overlayfs run -b bundle ctr Time (mean ± σ): 13.9 ms ± 0.9 ms [User: 6.2 ms, System: 10.8 ms] Range (min … max): 11.8 ms … 16.0 ms 180 runs Benchmark 3: ./runc-memfd run -b bundle ctr Time (mean ± σ): 22.6 ms ± 1.3 ms [User: 5.7 ms, System: 20.7 ms] Range (min … max): 19.9 ms … 26.5 ms 114 runs Summary ./runc-noclone run -b bundle ctr ran 1.01 ± 0.09 times faster than ./runc-overlayfs run -b bundle ctr 1.65 ± 0.15 times faster than 
./runc-memfd run -b bundle ctr Signed-off-by: Aleksa Sarai <[email protected]>
1 parent 8bebdba commit a2ac644

File tree

4 files changed

+153
-0
lines changed

4 files changed

+153
-0
lines changed

libcontainer/dmz/cloned_binary_linux.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,23 @@ func IsCloned(exe *os.File) bool {
212212
// make sure the container process can never resolve the original runc binary.
213213
// For more details on why this is necessary, see CVE-2019-5736.
214214
func CloneSelfExe(tmpDir string) (*os.File, error) {
215+
// Try to create a temporary overlayfs to produce a readonly version of
216+
// /proc/self/exe that cannot be "unwrapped" by the container. In contrast
217+
// to CloneBinary, this technique does not require any extra memory usage
218+
// and does not have the (fairly noticeable) performance impact of copying
219+
// a large binary file into a memfd.
220+
//
221+
// Based on some basic performance testing, the overlayfs approach has
222+
// effectively no performance overhead (it is on par with both
223+
// MS_BIND+MS_RDONLY and no binary cloning at all) while memfd copying adds
224+
// around ~60% overhead during container startup.
225+
overlayFile, err := sealedOverlayfs("/proc/self/exe", tmpDir)
226+
if err == nil {
227+
logrus.Debug("runc-dmz: using overlayfs for sealed /proc/self/exe") // used for tests
228+
return overlayFile, nil
229+
}
230+
logrus.WithError(err).Debugf("could not use overlayfs for /proc/self/exe sealing -- falling back to making a temporary copy")
231+
215232
selfExe, err := os.Open("/proc/self/exe")
216233
if err != nil {
217234
return nil, fmt.Errorf("opening current binary: %w", err)
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
package dmz
2+
3+
import (
4+
"fmt"
5+
"os"
6+
"path/filepath"
7+
"runtime"
8+
"strings"
9+
10+
"golang.org/x/sys/unix"
11+
12+
"github.com/opencontainers/runc/libcontainer/utils"
13+
)
14+
15+
func fsopen(fsName string, flags int) (*os.File, error) {
16+
// Make sure we always set O_CLOEXEC.
17+
flags |= unix.FSOPEN_CLOEXEC
18+
fd, err := unix.Fsopen(fsName, flags)
19+
if err != nil {
20+
return nil, os.NewSyscallError("fsopen "+fsName, err)
21+
}
22+
return os.NewFile(uintptr(fd), "fscontext:"+fsName), nil
23+
}
24+
25+
func fsmount(ctx *os.File, flags, mountAttrs int) (*os.File, error) {
26+
// Make sure we always set O_CLOEXEC.
27+
flags |= unix.FSMOUNT_CLOEXEC
28+
fd, err := unix.Fsmount(int(ctx.Fd()), flags, mountAttrs)
29+
if err != nil {
30+
return nil, os.NewSyscallError("fsmount "+ctx.Name(), err)
31+
}
32+
runtime.KeepAlive(ctx) // make sure fd is kept alive while it's used
33+
return os.NewFile(uintptr(fd), "fsmount:"+ctx.Name()), nil
34+
}
35+
36+
// escapeOverlayLowerDir returns path with every "\" and ":" prefixed by a
// backslash, so that the result can be used as a single entry in an overlayfs
// ":"-separated lowerdir list.
func escapeOverlayLowerDir(path string) string {
	var escaped strings.Builder
	escaped.Grow(len(path))
	// Both special characters are single ASCII bytes, so a byte-wise walk is
	// safe for UTF-8 paths and escapes each character exactly once.
	for i := 0; i < len(path); i++ {
		c := path[i]
		if c == '\\' || c == ':' {
			escaped.WriteByte('\\')
		}
		escaped.WriteByte(c)
	}
	return escaped.String()
}
41+
42+
// sealedOverlayfs will create an internal overlayfs mount using fsopen() that
43+
// uses the directory containing the binary as a lowerdir and a temporary tmpfs
44+
// as an upperdir. There is no way to "unwrap" this (unlike MS_BIND+MS_RDONLY)
45+
// and so we can create a safe zero-copy sealed version of /proc/self/exe.
46+
// This only works for privileged users and on kernels with overlayfs and
47+
// fsopen() enabled.
48+
//
49+
// TODO: Since Linux 5.11, overlayfs can be created inside user namespaces so
50+
// it is technically possible to create an overlayfs even for rootless
51+
// containers. Unfortunately, this would require some ugly manual CGo+fork
52+
// magic so we can do this later if we feel it's really needed.
53+
func sealedOverlayfs(binPath, tmpDir string) (_ *os.File, Err error) {
54+
// Try to do the superblock creation first to bail out early if we can't
55+
// use this method.
56+
overlayCtx, err := fsopen("overlay", unix.FSOPEN_CLOEXEC)
57+
if err != nil {
58+
return nil, err
59+
}
60+
defer overlayCtx.Close()
61+
62+
// binPath is going to be /proc/self/exe, so do a readlink to get the real
63+
// path. overlayfs needs the real underlying directory for this protection
64+
// mode to work properly.
65+
if realPath, err := os.Readlink(binPath); err == nil {
66+
binPath = realPath
67+
}
68+
binLowerDirPath, binName := filepath.Split(binPath)
69+
// Escape any ":"s or "\"s in the path.
70+
binLowerDirPath = escapeOverlayLowerDir(binLowerDirPath)
71+
72+
// Overlayfs requires two lowerdirs in order to run in "lower-only" mode,
73+
// where writes are completely blocked. Ideally we would create a dummy
74+
// tmpfs for this, but it turns out that overlayfs doesn't allow for
75+
// anonymous mountns paths.
76+
// NOTE: I'm working on a patch to fix this but it won't be backported.
77+
dummyLowerDirPath := escapeOverlayLowerDir(tmpDir)
78+
79+
// Configure the lowerdirs. The binary lowerdir needs to be on the top to
80+
// ensure that a file called "runc" (binName) in the dummy lowerdir doesn't
81+
// mask the binary.
82+
lowerDirStr := binLowerDirPath + ":" + dummyLowerDirPath
83+
if err := unix.FsconfigSetString(int(overlayCtx.Fd()), "lowerdir", lowerDirStr); err != nil {
84+
return nil, fmt.Errorf("fsconfig set overlayfs lowerdir=%s: %w", lowerDirStr, err)
85+
}
86+
87+
// Get an actual handle to the overlayfs.
88+
if err := unix.FsconfigCreate(int(overlayCtx.Fd())); err != nil {
89+
return nil, os.NewSyscallError("fsconfig create overlayfs", err)
90+
}
91+
overlayFd, err := fsmount(overlayCtx, unix.FSMOUNT_CLOEXEC, unix.MS_RDONLY|unix.MS_NODEV|unix.MS_NOSUID)
92+
if err != nil {
93+
return nil, err
94+
}
95+
defer overlayFd.Close()
96+
97+
// Grab a handle to the binary through overlayfs.
98+
exeFile, err := utils.Openat(overlayFd, binName, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
99+
if err != nil {
100+
return nil, fmt.Errorf("open %s from overlayfs (lowerdir=%s): %w", binName, lowerDirStr, err)
101+
}
102+
// NOTE: We would like to check that exeFile is the same as /proc/self/exe,
103+
// except this is a little difficult. Depending on what filesystems the
104+
// layers are on, overlayfs can remap the inode numbers (and it always
105+
// creates its own device numbers -- see ovl_map_dev_ino) so we can't do a
106+
// basic stat-based check. The only reasonable option would be to hash both
107+
// files and compare them, but this would require fully reading both files
108+
// which would produce a similar performance overhead to memfd cloning.
109+
//
110+
// Ultimately, there isn't a real attack to be worried about here. An
111+
// attacker would need to be able to modify files in /usr/sbin (or wherever
112+
// runc lives), at which point they could just replace the runc binary with
113+
// something malicious anyway.
114+
return exeFile, nil
115+
}

libcontainer/utils/utils_unix.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,3 +346,18 @@ func MkdirAllInRoot(root, unsafePath string, mode uint32) error {
346346
}
347347
return err
348348
}
349+
350+
// Openat is a Go-friendly openat(2) wrapper.
351+
func Openat(dir *os.File, path string, flags int, mode uint32) (*os.File, error) {
352+
dirFd := unix.AT_FDCWD
353+
if dir != nil {
354+
dirFd = int(dir.Fd())
355+
}
356+
flags |= unix.O_CLOEXEC
357+
358+
fd, err := unix.Openat(dirFd, path, flags, mode)
359+
if err != nil {
360+
return nil, &os.PathError{Op: "openat", Path: path, Err: err}
361+
}
362+
return os.NewFile(uintptr(fd), dir.Name()+"/"+path), nil
363+
}

tests/integration/run.bats

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,12 @@ function teardown() {
159159
[ "$status" -eq 0 ]
160160
[[ "$output" = *"Hello World"* ]]
161161
[[ "$output" = *"runc-dmz: using /proc/self/exe clone"* ]]
162+
if [ "$EUID" -eq 0 ] && is_kernel_gte 5.1 && grep -qFw overlay /proc/filesystems; then
163+
# If the kernel has fsopen() and we have privileges to use it, we will
164+
# use a temporary overlayfs instead of making a memfd clone of
165+
# /proc/self/exe.
166+
[[ "$output" = *"runc-dmz: using overlayfs for sealed /proc/self/exe"* ]]
167+
fi
162168
}
163169

164170
@test "runc run [joining existing container namespaces]" {

0 commit comments

Comments
 (0)