Skip to content

Commit 9b80e9f

Browse files
committed
runc exec: use CLONE_INTO_CGROUP when available
It makes sense to make runc exec benefit from clone3(CLONE_INTO_CGROUP), if it is available. Since it requires a recent kernel and might not work, implement a fallback to the older way of joining the cgroup. Based on work done in: https://go-review.googlesource.com/c/go/+/417695, coreos/go-systemd#458, opencontainers/cgroups#26, and #4822. Signed-off-by: Kir Kolyshkin <[email protected]>
1 parent 77ead42 commit 9b80e9f

File tree

2 files changed

+91
-2
lines changed

2 files changed

+91
-2
lines changed

libcontainer/process_linux.go

Lines changed: 90 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"strconv"
1717
"strings"
1818
"sync"
19+
"syscall"
1920
"time"
2021

2122
"github.com/opencontainers/runtime-spec/specs-go"
@@ -310,18 +311,106 @@ func (p *setnsProcess) addIntoCgroupV2() error {
310311
}
311312

312313
func (p *setnsProcess) addIntoCgroup() error {
314+
if p.cmd.SysProcAttr.UseCgroupFD {
315+
// We've used cgroupfd successfully, so the process is
316+
// already in the proper cgroup, nothing to do here.
317+
return nil
318+
}
313319
if cgroups.IsCgroup2UnifiedMode() {
314320
return p.addIntoCgroupV2()
315321
}
316322
return p.addIntoCgroupV1()
317323
}
318324

325+
// prepareCgroupFD sets up p.cmd to use clone3 with CLONE_INTO_CGROUP
326+
// to join cgroup early, in p.cmd.Start. Returns an *os.File which
327+
// must be closed by the caller after p.Cmd.Start return.
328+
func (p *setnsProcess) prepareCgroupFD() (*os.File, error) {
329+
if !cgroups.IsCgroup2UnifiedMode() {
330+
return nil, nil
331+
}
332+
333+
base := p.manager.Path("")
334+
if base == "" { // No cgroup to join.
335+
return nil, nil
336+
}
337+
sub := ""
338+
if p.process.SubCgroupPaths != nil {
339+
sub = p.process.SubCgroupPaths[""]
340+
}
341+
cgroup := path.Join(base, sub)
342+
if !strings.HasPrefix(cgroup, base) {
343+
return nil, fmt.Errorf("bad sub cgroup path: %s", sub)
344+
}
345+
346+
fd, err := os.OpenFile(cgroup, unix.O_PATH|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
347+
if err != nil {
348+
if p.rootlessCgroups {
349+
return nil, nil
350+
}
351+
return nil, fmt.Errorf("can't open cgroup: %w", err)
352+
}
353+
354+
logrus.Debugf("using CLONE_INTO_CGROUP %q", cgroup)
355+
if p.cmd.SysProcAttr == nil {
356+
p.cmd.SysProcAttr = &syscall.SysProcAttr{}
357+
}
358+
p.cmd.SysProcAttr.UseCgroupFD = true
359+
p.cmd.SysProcAttr.CgroupFD = int(fd.Fd())
360+
361+
return fd, nil
362+
}
363+
364+
// shouldRetryWithoutCgroupFD tells if the error returned from p.cmd.Start
365+
// could be caused by using cgroupfd.
366+
func (p *setnsProcess) shouldRetryWithoutCgroupFD(err error) bool {
367+
if err == nil || !p.cmd.SysProcAttr.UseCgroupFD {
368+
return false
369+
}
370+
logrus.Debugf("exec with CLONE_INTO_CGROUP failed: %v", err)
371+
372+
switch {
373+
// Cgroup in which a domain controller is enabled.
374+
case errors.Is(err, unix.EBUSY):
375+
return true
376+
// The cgroup is in the domain invalid state.
377+
case errors.Is(err, unix.EOPNOTSUPP):
378+
return true
379+
// Rootless with no direct access to cgroup.
380+
case p.rootlessCgroups && errors.Is(err, unix.EACCES):
381+
return true
382+
// No clone3 syscall (kernels < v5.3).
383+
case errors.Is(err, unix.ENOSYS):
384+
return true
385+
// No CLONE_INTO_CGROUP flag support (kernels v5.3 to v5.7).
386+
case errors.Is(err, unix.E2BIG):
387+
return true
388+
}
389+
390+
return false
391+
}
392+
319393
func (p *setnsProcess) start() (retErr error) {
320394
defer p.comm.closeParent()
321395

396+
fd, err := p.prepareCgroupFD()
397+
if err != nil {
398+
return err
399+
}
400+
322401
// Get the "before" value of oom kill count.
323402
oom, _ := p.manager.OOMKillCount()
324-
err := p.startWithCPUAffinity()
403+
404+
err = p.startWithCPUAffinity()
405+
if fd != nil {
406+
fd.Close()
407+
}
408+
if p.shouldRetryWithoutCgroupFD(err) {
409+
// SysProcAttr.CgroupFD is never used when UseCgroupFD is unset.
410+
p.cmd.SysProcAttr.UseCgroupFD = false
411+
err = p.startWithCPUAffinity()
412+
}
413+
325414
// Close the child-side of the pipes (controlled by child).
326415
p.comm.closeChild()
327416
if err != nil {

tests/integration/exec.bats

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,7 @@ function check_exec_debug() {
282282
# Check we can't join non-existing subcgroup.
283283
runc exec --cgroup nonexistent test_busybox cat /proc/self/cgroup
284284
[ "$status" -ne 0 ]
285-
[[ "$output" == *" adding pid "*"o such file or directory"* ]]
285+
[[ "$output" == *" cgroup"*"o such file or directory"* ]]
286286

287287
# Check we can join top-level cgroup (implicit).
288288
runc exec test_busybox grep '^0::/$' /proc/self/cgroup

0 commit comments

Comments
 (0)