Skip to content

Commit c6b7bc5

Browse files
committed
oci: Isolate cgroups under the current hierarchy
When buildkitd is run in a privileged container on Kubernetes, the `/sys/fs/cgroup` mount will be that of the host which allows buildkitd to remove itself from the cgroup hierarchy managed by Kubernetes (cgroupfs). When buildkitd's worker creates cgroups outside of the externally managed hierarchy, resource accounting is incorrect and resource limits are not enforced. This can lead to OOM and other CPU contention issues on nodes. Introduce a new `isolateCgroups` configuration for the OCI worker. If set, all cgroups are created beneath the cgroup hierarchy of the buildkitd process.
1 parent 474cae7 commit c6b7bc5

File tree

10 files changed

+91
-33
lines changed

10 files changed

+91
-33
lines changed

cmd/buildkitd/config/config.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,10 @@ type OCIConfig struct {
122122
ProxySnapshotterPath string `toml:"proxySnapshotterPath"`
123123
DefaultCgroupParent string `toml:"defaultCgroupParent"`
124124

125+
// IsolateCgroups keeps all cgroups (including DefaultCgroupParent) under
126+
// the cgroup hierarchy of the buildkitd process.
127+
IsolateCgroups bool `toml:"isolateCgroups"`
128+
125129
// StargzSnapshotterConfig is configuration for stargz snapshotter.
126130
// We use a generic map[string]interface{} in order to remove the dependency
127131
// on stargz snapshotter's config pkg from our config.

cmd/buildkitd/main_oci_worker.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,10 @@ func init() {
126126
Usage: "limit the number of parallel build steps that can run at the same time",
127127
Value: defaultConf.Workers.OCI.MaxParallelism,
128128
},
129+
cli.BoolFlag{
130+
Name: "oci-isolate-cgroups",
131+
Usage: "isolate cgroups to the cgroup hierarchy of the buildkitd process",
132+
},
129133
}
130134
n := "oci-worker-rootless"
131135
u := "enable rootless mode"
@@ -260,6 +264,9 @@ func applyOCIFlags(c *cli.Context, cfg *config.Config) error {
260264
if c.GlobalIsSet("oci-max-parallelism") {
261265
cfg.Workers.OCI.MaxParallelism = c.GlobalInt("oci-max-parallelism")
262266
}
267+
if c.GlobalIsSet("oci-isolate-cgroups") {
268+
cfg.Workers.OCI.IsolateCgroups = c.GlobalBool("oci-isolate-cgroups")
269+
}
263270

264271
return nil
265272
}
@@ -327,7 +334,7 @@ func ociWorkerInitializer(c *cli.Context, common workerInitializerOpt) ([]worker
327334
parallelismSem = semaphore.NewWeighted(int64(cfg.MaxParallelism))
328335
}
329336

330-
opt, err := runc.NewWorkerOpt(common.config.Root, snFactory, cfg.Rootless, processMode, cfg.Labels, idmapping, nc, dns, cfg.Binary, cfg.ApparmorProfile, cfg.SELinux, parallelismSem, common.traceSocket, cfg.DefaultCgroupParent, cdiManager)
337+
opt, err := runc.NewWorkerOpt(common.config.Root, snFactory, cfg.Rootless, processMode, cfg.Labels, idmapping, nc, dns, cfg.Binary, cfg.ApparmorProfile, cfg.SELinux, parallelismSem, common.traceSocket, cfg.DefaultCgroupParent, cfg.IsolateCgroups, cdiManager)
331338
if err != nil {
332339
return nil, err
333340
}

docs/buildkitd.toml.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,12 @@ provenanceEnvDir = "/etc/buildkit/provenance.d"
102102
# maintain a pool of reusable CNI network namespaces to amortize the overhead
103103
# of allocating and releasing the namespaces
104104
cniPoolSize = 16
105+
# defaultCgroupParent sets the parent cgroup of all containers.
106+
defaultCgroupParent = "buildkit"
107+
# isolateCgroups keeps all buildkitd managed cgroups under the cgroup
108+
# hierarchy of the buildkitd process. if you are running buildkitd in
109+
# Kubernetes, set this to true to ensure resource limits work as expected.
110+
isolateCgroups = true
105111

106112
[worker.oci.labels]
107113
"foo" = "bar"

executor/containerdexecutor/executor_unix.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ func (w *containerdExecutor) createOCISpec(ctx context.Context, id, resolvConf,
140140
}
141141

142142
processMode := oci.ProcessSandbox // FIXME(AkihiroSuda)
143-
spec, cleanup, err := oci.GenerateSpec(ctx, meta, mounts, id, resolvConf, hostsFile, namespace, w.cgroupParent, processMode, nil, w.apparmorProfile, w.selinux, w.traceSocket, w.cdiManager, opts...)
143+
spec, cleanup, err := oci.GenerateSpec(ctx, meta, mounts, id, resolvConf, hostsFile, namespace, w.cgroupParent, "", processMode, nil, w.apparmorProfile, w.selinux, w.traceSocket, w.cdiManager, opts...)
144144
if err != nil {
145145
releaseAll()
146146
return nil, nil, err

executor/containerdexecutor/executor_windows.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ func (w *containerdExecutor) createOCISpec(ctx context.Context, id, _, _ string,
8585
}
8686

8787
processMode := oci.ProcessSandbox // FIXME(AkihiroSuda)
88-
spec, cleanup, err := oci.GenerateSpec(ctx, meta, mounts, id, "", "", namespace, "", processMode, nil, "", false, w.traceSocket, nil, opts...)
88+
spec, cleanup, err := oci.GenerateSpec(ctx, meta, mounts, id, "", "", namespace, "", "", processMode, nil, "", false, w.traceSocket, nil, opts...)
8989
if err != nil {
9090
releaseAll()
9191
return nil, nil, err

executor/oci/spec.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ const (
3838
// Note that NoProcessSandbox allows build containers to kill (and potentially ptrace) an arbitrary process in the BuildKit host namespace.
3939
// NoProcessSandbox should be enabled only when the BuildKit is running in a container as an unprivileged user.
4040
NoProcessSandbox
41+
42+
// cgroupNamespace is the cgroup under which container cgroups are created.
43+
cgroupNamespace = "buildkit"
4144
)
4245

4346
var tracingEnvVars = []string{
@@ -61,7 +64,7 @@ func (pm ProcessMode) String() string {
6164

6265
// GenerateSpec generates spec using containerd functionality.
6366
// opts are ignored for s.Process, s.Hostname, and s.Mounts .
64-
func GenerateSpec(ctx context.Context, meta executor.Meta, mounts []executor.Mount, id, resolvConf, hostsFile string, namespace network.Namespace, cgroupParent string, processMode ProcessMode, idmap *user.IdentityMapping, apparmorProfile string, selinuxB bool, tracingSocket string, cdiManager *cdidevices.Manager, opts ...oci.SpecOpts) (*specs.Spec, func(), error) {
67+
func GenerateSpec(ctx context.Context, meta executor.Meta, mounts []executor.Mount, id, resolvConf, hostsFile string, namespace network.Namespace, cgroupParent string, cgroupRoot string, processMode ProcessMode, idmap *user.IdentityMapping, apparmorProfile string, selinuxB bool, tracingSocket string, cdiManager *cdidevices.Manager, opts ...oci.SpecOpts) (*specs.Spec, func(), error) {
6568
c := &containers.Container{
6669
ID: id,
6770
}
@@ -75,15 +78,17 @@ func GenerateSpec(ctx context.Context, meta executor.Meta, mounts []executor.Mou
7578
if strings.Contains(cgroupParent, ".slice") && lastSeparator == ":" {
7679
cgroupsPath = cgroupParent + id
7780
} else {
78-
cgroupsPath = filepath.Join("/", cgroupParent, "buildkit", id)
81+
cgroupsPath = filepath.Join(cgroupParent, cgroupNamespace, id)
7982
}
80-
opts = append(opts, oci.WithCgroup(cgroupsPath))
83+
opts = append(opts, oci.WithCgroup(filepath.Join("/", cgroupRoot, cgroupsPath)))
84+
} else if cgroupRoot != "" {
85+
opts = append(opts, oci.WithCgroup(filepath.Join("/", cgroupRoot, cgroupNamespace, id)))
8186
}
8287

8388
// containerd/oci.GenerateSpec requires a namespace, which
8489
// will be used to namespace specs.Linux.CgroupsPath if generated
8590
if _, ok := namespaces.Namespace(ctx); !ok {
86-
ctx = namespaces.WithNamespace(ctx, "buildkit")
91+
ctx = namespaces.WithNamespace(ctx, cgroupNamespace)
8792
}
8893

8994
opts = append(opts, generateMountOpts(resolvConf, hostsFile)...)

executor/resources/monitor.go

Lines changed: 51 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -174,10 +174,11 @@ func (r *nopRecord) Start() {
174174
}
175175

176176
type Monitor struct {
177-
mu sync.Mutex
178-
closed chan struct{}
179-
records map[string]*cgroupRecord
180-
proc procfs.FS
177+
mu sync.Mutex
178+
closed chan struct{}
179+
records map[string]*cgroupRecord
180+
proc procfs.FS
181+
rootCgroup string
181182
}
182183

183184
type NetworkSampler interface {
@@ -221,63 +222,92 @@ func (m *Monitor) Close() error {
221222
return nil
222223
}
223224

224-
func NewMonitor() (*Monitor, error) {
225+
func (m *Monitor) RootCgroup() string {
226+
return m.rootCgroup
227+
}
228+
229+
func NewMonitor(isolateCgroups bool) (*Monitor, error) {
230+
fs, err := procfs.NewDefaultFS()
231+
if err != nil {
232+
return nil, err
233+
}
234+
235+
rootCgroup := ""
236+
225237
initOnce.Do(func() {
226238
isCgroupV2 = isCgroup2()
227239
if !isCgroupV2 {
228240
return
229241
}
230-
if err := prepareCgroupControllers(); err != nil {
242+
243+
cgroupPath := defaultMountpoint
244+
245+
if isolateCgroups {
246+
proc, err := fs.Self()
247+
if err != nil {
248+
bklog.L.Warnf("failed to get current process info: %+v", err)
249+
return
250+
}
251+
252+
cgroups, err := proc.Cgroups()
253+
if err != nil {
254+
bklog.L.Warnf("failed to get current cgroups: %+v", err)
255+
return
256+
}
257+
258+
if len(cgroups) > 0 {
259+
rootCgroup = cgroups[0].Path
260+
cgroupPath = filepath.Join(cgroupPath, cgroups[0].Path)
261+
}
262+
}
263+
264+
if err := prepareCgroupControllers(cgroupPath); err != nil {
231265
bklog.L.Warnf("failed to prepare cgroup controllers: %+v", err)
232266
}
233267
})
234268

235-
fs, err := procfs.NewDefaultFS()
236-
if err != nil {
237-
return nil, err
238-
}
239-
240269
return &Monitor{
241-
closed: make(chan struct{}),
242-
records: make(map[string]*cgroupRecord),
243-
proc: fs,
270+
rootCgroup: rootCgroup,
271+
closed: make(chan struct{}),
272+
records: make(map[string]*cgroupRecord),
273+
proc: fs,
244274
}, nil
245275
}
246276

247-
func prepareCgroupControllers() error {
277+
func prepareCgroupControllers(cgroupPath string) error {
248278
v, ok := os.LookupEnv("BUILDKIT_SETUP_CGROUPV2_ROOT")
249279
if !ok {
250280
return nil
251281
}
252282
if b, _ := strconv.ParseBool(v); !b {
253283
return nil
254284
}
255-
// move current process to init cgroup
256-
if err := os.MkdirAll(filepath.Join(defaultMountpoint, initGroup), 0755); err != nil {
285+
// move current process to an init cgroup subgroup
286+
if err := os.MkdirAll(filepath.Join(cgroupPath, initGroup), 0755); err != nil {
257287
return err
258288
}
259-
f, err := os.OpenFile(filepath.Join(defaultMountpoint, cgroupProcsFile), os.O_RDONLY, 0)
289+
f, err := os.OpenFile(filepath.Join(cgroupPath, cgroupProcsFile), os.O_RDONLY, 0)
260290
if err != nil {
261291
return err
262292
}
263293
s := bufio.NewScanner(f)
264294
for s.Scan() {
265-
if err := os.WriteFile(filepath.Join(defaultMountpoint, initGroup, cgroupProcsFile), s.Bytes(), 0); err != nil {
295+
if err := os.WriteFile(filepath.Join(cgroupPath, initGroup, cgroupProcsFile), s.Bytes(), 0); err != nil {
266296
return err
267297
}
268298
}
269299
if err := f.Close(); err != nil {
270300
return err
271301
}
272-
dt, err := os.ReadFile(filepath.Join(defaultMountpoint, cgroupControllersFile))
302+
dt, err := os.ReadFile(filepath.Join(cgroupPath, cgroupControllersFile))
273303
if err != nil {
274304
return err
275305
}
276306
for c := range strings.SplitSeq(string(dt), " ") {
277307
if c == "" {
278308
continue
279309
}
280-
if err := os.WriteFile(filepath.Join(defaultMountpoint, cgroupSubtreeFile), []byte("+"+c), 0); err != nil {
310+
if err := os.WriteFile(filepath.Join(cgroupPath, cgroupSubtreeFile), []byte("+"+c), 0); err != nil {
281311
// ignore error
282312
bklog.L.Warnf("failed to enable cgroup controller %q: %+v", c, err)
283313
}

executor/runcexecutor/executor.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,11 @@ type Opt struct {
4545
CommandCandidates []string
4646
// without root privileges (has nothing to do with Opt.Root directory)
4747
Rootless bool
48-
// DefaultCgroupParent is the cgroup-parent name for executor
48+
// DefaultCgroupParent is the default cgroup-parent name for executor
4949
DefaultCgroupParent string
50+
// RootCgroup ensures all cgroups (including DefaultCgroupParent) are
51+
// created beneath the given root cgroup.
52+
RootCgroup string
5053
// ProcessMode
5154
ProcessMode oci.ProcessMode
5255
IdentityMapping *user.IdentityMapping
@@ -67,6 +70,7 @@ type runcExecutor struct {
6770
runc *runc.Runc
6871
root string
6972
cgroupParent string
73+
rootCgroup string
7074
rootless bool
7175
networkProviders map[pb.NetMode]network.Provider
7276
processMode oci.ProcessMode
@@ -135,6 +139,7 @@ func New(opt Opt, networkProviders map[pb.NetMode]network.Provider) (executor.Ex
135139
runc: runtime,
136140
root: root,
137141
cgroupParent: opt.DefaultCgroupParent,
142+
rootCgroup: opt.RootCgroup,
138143
rootless: opt.Rootless,
139144
networkProviders: networkProviders,
140145
processMode: opt.ProcessMode,
@@ -268,7 +273,7 @@ func (w *runcExecutor) Run(ctx context.Context, id string, root executor.Mount,
268273
}
269274
}
270275

271-
spec, cleanup, err := oci.GenerateSpec(ctx, meta, mounts, id, resolvConf, hostsFile, namespace, w.cgroupParent, w.processMode, w.idmap, w.apparmorProfile, w.selinux, w.tracingSocket, w.cdiManager, opts...)
276+
spec, cleanup, err := oci.GenerateSpec(ctx, meta, mounts, id, resolvConf, hostsFile, namespace, w.cgroupParent, w.rootCgroup, w.processMode, w.idmap, w.apparmorProfile, w.selinux, w.tracingSocket, w.cdiManager, opts...)
272277
if err != nil {
273278
return nil, err
274279
}

worker/runc/runc.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ type SnapshotterFactory struct {
4040
}
4141

4242
// NewWorkerOpt creates a WorkerOpt.
43-
func NewWorkerOpt(root string, snFactory SnapshotterFactory, rootless bool, processMode oci.ProcessMode, labels map[string]string, idmap *user.IdentityMapping, nopt netproviders.Opt, dns *oci.DNSConfig, binary, apparmorProfile string, selinux bool, parallelismSem *semaphore.Weighted, traceSocket, defaultCgroupParent string, cdiManager *cdidevices.Manager) (base.WorkerOpt, error) {
43+
func NewWorkerOpt(root string, snFactory SnapshotterFactory, rootless bool, processMode oci.ProcessMode, labels map[string]string, idmap *user.IdentityMapping, nopt netproviders.Opt, dns *oci.DNSConfig, binary, apparmorProfile string, selinux bool, parallelismSem *semaphore.Weighted, traceSocket, defaultCgroupParent string, isolateCgroups bool, cdiManager *cdidevices.Manager) (base.WorkerOpt, error) {
4444
var opt base.WorkerOpt
4545
name := "runc-" + snFactory.Name
4646
root = filepath.Join(root, name)
@@ -59,7 +59,7 @@ func NewWorkerOpt(root string, snFactory SnapshotterFactory, rootless bool, proc
5959
cmds = append(cmds, binary)
6060
}
6161

62-
rm, err := resources.NewMonitor()
62+
rm, err := resources.NewMonitor(isolateCgroups)
6363
if err != nil {
6464
return opt, err
6565
}
@@ -79,6 +79,7 @@ func NewWorkerOpt(root string, snFactory SnapshotterFactory, rootless bool, proc
7979
SELinux: selinux,
8080
TracingSocket: traceSocket,
8181
DefaultCgroupParent: defaultCgroupParent,
82+
RootCgroup: rm.RootCgroup(),
8283
ResourceMonitor: rm,
8384
CDIManager: cdiManager,
8485
}, np)

worker/runc/runc_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ func newWorkerOpt(t *testing.T, processMode oci.ProcessMode) base.WorkerOpt {
4242
},
4343
}
4444
rootless := false
45-
workerOpt, err := NewWorkerOpt(tmpdir, snFactory, rootless, processMode, nil, nil, netproviders.Opt{Mode: "host"}, nil, "", "", false, nil, "", "", nil)
45+
workerOpt, err := NewWorkerOpt(tmpdir, snFactory, rootless, processMode, nil, nil, netproviders.Opt{Mode: "host"}, nil, "", "", false, nil, "", "", false, nil)
4646
require.NoError(t, err)
4747

4848
return workerOpt

0 commit comments

Comments
 (0)