-
Notifications
You must be signed in to change notification settings - Fork 2.2k
use clone3 for exec process creation to reduce cgroup lock contention #4782
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,7 @@ import ( | |
"runtime" | ||
"strconv" | ||
"sync" | ||
"syscall" | ||
"time" | ||
|
||
"github.com/opencontainers/runtime-spec/specs-go" | ||
|
@@ -203,6 +204,28 @@ func (p *setnsProcess) start() (retErr error) { | |
|
||
// Get the "before" value of oom kill count. | ||
oom, _ := p.manager.OOMKillCount() | ||
useClone3 := false | ||
if cgroups.IsCgroup2UnifiedMode() && p.initProcessPid != 0 { | ||
initProcCgroupFile := fmt.Sprintf("/proc/%d/cgroup", p.initProcessPid) | ||
initCg, initCgErr := cgroups.ParseCgroupFile(initProcCgroupFile) | ||
if initCgErr == nil { | ||
if initCgPath, ok := initCg[""]; ok { | ||
useClone3 = true | ||
initCgDirpath := filepath.Join(fs2.UnifiedMountpoint, initCgPath) | ||
fd, err := os.Open(initCgDirpath) | ||
if err != nil { | ||
return fmt.Errorf("error opening cgroup dir %q: %w", initCgDirpath, err) | ||
} | ||
defer fd.Close() | ||
if p.cmd.SysProcAttr == nil { | ||
p.cmd.SysProcAttr = &syscall.SysProcAttr{} | ||
} | ||
p.cmd.SysProcAttr.UseCgroupFD = true | ||
p.cmd.SysProcAttr.CgroupFD = int(fd.Fd()) | ||
Comment on lines
+223
to
+224
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. man clone3 says this is available since linux 5.7. You are setting useClone3 to true, but I don't think that is detecting There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In case There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Or, we can rely on a kernel version (and have it in mind that it can still fail if e.g. clone3 is denied by the security policy). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Usually a "dummy" exec of the syscall (with some nil params or whatever) is enough to know if it's supported. I guess something similar should be possible for this clone_into_cgroup param. Usually that is recommended because people might backport it to old kernels, and if you just check the version you might not use it when it is available. |
||
} | ||
} | ||
} | ||
|
||
err := p.startWithCPUAffinity() | ||
// Close the child-side of the pipes (controlled by child). | ||
p.comm.closeChild() | ||
|
@@ -232,7 +255,11 @@ func (p *setnsProcess) start() (retErr error) { | |
return fmt.Errorf("error executing setns process: %w", err) | ||
} | ||
for _, path := range p.cgroupPaths { | ||
if err := cgroups.WriteCgroupProc(path, p.pid()); err != nil && !p.rootlessCgroups { | ||
procPid := p.pid() | ||
if useClone3 { | ||
procPid = -1 | ||
} | ||
if err := cgroups.WriteCgroupProc(path, procPid); err != nil && !p.rootlessCgroups { | ||
Comment on lines
+258
to
+262
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess if it's -1 it's not written? Let's just useClone3 for the condition, explaining that if we are using clone3, then it's already set in the cgroup. |
||
// On cgroup v2 + nesting + domain controllers, WriteCgroupProc may fail with EBUSY. | ||
// https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643 | ||
// Try to join the cgroup of InitProcessPid. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
start() is already complex, let's move this to a function with a clear name, so it's more readable.