@@ -16,6 +16,7 @@ import (
16
16
"strconv"
17
17
"strings"
18
18
"sync"
19
+ "syscall"
19
20
"time"
20
21
21
22
"github.com/opencontainers/runtime-spec/specs-go"
@@ -310,18 +311,106 @@ func (p *setnsProcess) addIntoCgroupV2() error {
310
311
}
311
312
312
313
func (p * setnsProcess ) addIntoCgroup () error {
314
+ if p .cmd .SysProcAttr .UseCgroupFD {
315
+ // We've used cgroupfd successfully, so the process is
316
+ // already in the proper cgroup, nothing to do here.
317
+ return nil
318
+ }
313
319
if cgroups .IsCgroup2UnifiedMode () {
314
320
return p .addIntoCgroupV2 ()
315
321
}
316
322
return p .addIntoCgroupV1 ()
317
323
}
318
324
325
+ // prepareCgroupFD sets up p.cmd to use clone3 with CLONE_INTO_CGROUP
326
+ // to join cgroup early, in p.cmd.Start. Returns an *os.File which
327
+ // must be closed by the caller after p.Cmd.Start return.
328
+ func (p * setnsProcess ) prepareCgroupFD () (* os.File , error ) {
329
+ if ! cgroups .IsCgroup2UnifiedMode () {
330
+ return nil , nil
331
+ }
332
+
333
+ base := p .manager .Path ("" )
334
+ if base == "" { // No cgroup to join.
335
+ return nil , nil
336
+ }
337
+ sub := ""
338
+ if p .process .SubCgroupPaths != nil {
339
+ sub = p .process .SubCgroupPaths ["" ]
340
+ }
341
+ cgroup := path .Join (base , sub )
342
+ if ! strings .HasPrefix (cgroup , base ) {
343
+ return nil , fmt .Errorf ("bad sub cgroup path: %s" , sub )
344
+ }
345
+
346
+ fd , err := os .OpenFile (cgroup , unix .O_PATH | unix .O_DIRECTORY | unix .O_CLOEXEC , 0 )
347
+ if err != nil {
348
+ if p .rootlessCgroups {
349
+ return nil , nil
350
+ }
351
+ return nil , fmt .Errorf ("can't open cgroup: %w" , err )
352
+ }
353
+
354
+ logrus .Debugf ("using CLONE_INTO_CGROUP %q" , cgroup )
355
+ if p .cmd .SysProcAttr == nil {
356
+ p .cmd .SysProcAttr = & syscall.SysProcAttr {}
357
+ }
358
+ p .cmd .SysProcAttr .UseCgroupFD = true
359
+ p .cmd .SysProcAttr .CgroupFD = int (fd .Fd ())
360
+
361
+ return fd , nil
362
+ }
363
+
364
+ // shouldRetryWithoutCgroupFD tells if the error returned from p.cmd.Start
365
+ // could be caused by using cgroupfd.
366
+ func (p * setnsProcess ) shouldRetryWithoutCgroupFD (err error ) bool {
367
+ if err == nil || ! p .cmd .SysProcAttr .UseCgroupFD {
368
+ return false
369
+ }
370
+ logrus .Debugf ("exec with CLONE_INTO_CGROUP failed: %v" , err )
371
+
372
+ switch {
373
+ // Cgroup in which a domain controller is enabled.
374
+ case errors .Is (err , unix .EBUSY ):
375
+ return true
376
+ // The cgroup is in the domain invalid state.
377
+ case errors .Is (err , unix .EOPNOTSUPP ):
378
+ return true
379
+ // Rootless with no direct access to cgroup.
380
+ case p .rootlessCgroups && errors .Is (err , unix .EACCES ):
381
+ return true
382
+ // No clone3 syscall (kernels < v5.3).
383
+ case errors .Is (err , unix .ENOSYS ):
384
+ return true
385
+ // No CLONE_INTO_CGROUP flag support (kernels v5.3 to v5.7).
386
+ case errors .Is (err , unix .E2BIG ):
387
+ return true
388
+ }
389
+
390
+ return false
391
+ }
392
+
319
393
func (p * setnsProcess ) start () (retErr error ) {
320
394
defer p .comm .closeParent ()
321
395
396
+ fd , err := p .prepareCgroupFD ()
397
+ if err != nil {
398
+ return err
399
+ }
400
+
322
401
// Get the "before" value of oom kill count.
323
402
oom , _ := p .manager .OOMKillCount ()
324
- err := p .startWithCPUAffinity ()
403
+
404
+ err = p .startWithCPUAffinity ()
405
+ if fd != nil {
406
+ fd .Close ()
407
+ }
408
+ if p .shouldRetryWithoutCgroupFD (err ) {
409
+ // SysProcAttr.CgroupFD is never used when UseCgroupFD is unset.
410
+ p .cmd .SysProcAttr .UseCgroupFD = false
411
+ err = p .startWithCPUAffinity ()
412
+ }
413
+
325
414
// Close the child-side of the pipes (controlled by child).
326
415
p .comm .closeChild ()
327
416
if err != nil {
0 commit comments