From 99b95b626c4fba4212941402a751d66dd91c2b67 Mon Sep 17 00:00:00 2001 From: ningmingxiao Date: Wed, 16 Jul 2025 15:40:49 +0800 Subject: [PATCH] fix:cpu affinity Signed-off-by: ningmingxiao --- .github/workflows/test.yml | 1 + libcontainer/process_linux.go | 44 +++++++++++++++++++++++++++ libcontainer/utils/utils.go | 31 +++++++++++++++++++ libcontainer/utils/utils_test.go | 46 +++++++++++++++++++++++++++++ tests/integration/cpu_affinity.bats | 17 +++++++++++ 5 files changed, 139 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d61efd151e1..b7a8878c506 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -169,6 +169,7 @@ jobs: - name: integration test (systemd driver) run: | + sudo taskset -pc 0-1 1 # Delegate all cgroup v2 controllers to rootless user via --systemd-cgroup. # The default (since systemd v252) is "pids memory cpu". sudo mkdir -p /etc/systemd/system/user@.service.d diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 386b5f76200..381043d9ee7 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -198,6 +198,17 @@ func (p *setnsProcess) setFinalCPUAffinity() error { return nil } +func (p *setnsProcess) hasExecCPUAffinity() bool { + aff := p.config.CPUAffinity + if aff == nil { + return false + } + if aff.Initial != nil || aff.Final != nil { + return true + } + return false +} + func (p *setnsProcess) start() (retErr error) { defer p.comm.closeParent() @@ -258,6 +269,13 @@ func (p *setnsProcess) start() (retErr error) { if err := p.setFinalCPUAffinity(); err != nil { return err } + + if !p.hasExecCPUAffinity() { + if err := resetAffinityMask(p.pid()); err != nil { + return err + } + } + if p.intelRdtPath != "" { // if Intel RDT "resource control" filesystem path exists _, err := os.Stat(p.intelRdtPath) @@ -615,6 +633,11 @@ func (p *initProcess) start() (retErr error) { return fmt.Errorf("unable to apply cgroup configuration: %w", err) } 
} + + if err := resetAffinityMask(p.pid()); err != nil { + return err + } + if p.intelRdtManager != nil { if err := p.intelRdtManager.Apply(p.pid()); err != nil { return fmt.Errorf("unable to apply Intel RDT configuration: %w", err) @@ -981,3 +1004,24 @@ func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) { } return i, nil } + +// Set all inherited cpu affinity. Old kernels do that automatically, but +// new kernels remember the affinity that was set before the cgroup move. +// This is undesirable, because it inherits the systemd affinity when the container +// should really move to the container space cpus. +// here we can't use runtime.NumCPU() to get cpu counts because it call sched_getaffinity to get cpu counts. +// If systemd set CPUAffinity then use runtime.NumCPU() can't get real cpu counts. +func resetAffinityMask(pid int) error { + cpus, err := utils.SystemCPUCores() + if err != nil { + return err + } + cpuset := unix.CPUSet{} + for i := 0; i < int(cpus); i++ { + cpuset.Set(i) + } + if err := unix.SchedSetaffinity(pid, &cpuset); err != nil { + return fmt.Errorf("error resetting pid %d affinity: %w", pid, err) + } + return nil +} diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go index 442c02685bf..1274d0de181 100644 --- a/libcontainer/utils/utils.go +++ b/libcontainer/utils/utils.go @@ -1,7 +1,9 @@ package utils import ( + "bufio" "encoding/json" + "fmt" "io" "os" "path/filepath" @@ -113,3 +115,32 @@ func Annotations(labels []string) (bundle string, userAnnotations map[string]str } return } + +// SystemCPUCores parses CPU usage information from a reader providing +// /proc/stat format data. It returns the number of CPUs. 
// SystemCPUCores returns the number of CPUs listed in /proc/stat, i.e. the
// number of per-CPU "cpuN" lines found there. It reads /proc/stat instead of
// consulting the calling process's affinity mask.
//
// An error is returned if /proc/stat cannot be opened or read, or if it
// contains no "cpuN" line.
func SystemCPUCores() (cpuNum uint32, _ error) {
	f, err := os.Open("/proc/stat")
	if err != nil {
		return 0, err
	}
	defer f.Close()
	return readSystemCPU(f)
}

// readSystemCPU counts the per-CPU "cpuN" lines (N being a decimal CPU
// number) at the start of /proc/stat-formatted data read from r.
//
// Counting stops at the first line that does not begin with "cpu", or at the
// end of the input, whichever comes first. The aggregate "cpu" line is
// skipped, and a final line without a trailing newline is still examined.
// An error is returned if reading fails or if no "cpuN" line was found.
func readSystemCPU(r io.Reader) (cpuNum uint32, _ error) {
	br := bufio.NewReader(r)
	for {
		line, err := br.ReadString('\n')
		if err != nil && err != io.EOF {
			return 0, fmt.Errorf("error scanning /proc/stat file: %w", err)
		}
		// On io.EOF, line still holds any unterminated trailing text (possibly
		// none), so it is inspected before the loop ends. The length check
		// keeps blank or very short lines from causing an out-of-range slice.
		if len(line) < 3 || line[:3] != "cpu" {
			break
		}
		// Only count "cpuN"; the summary line has a space after "cpu".
		if len(line) > 3 && '0' <= line[3] && line[3] <= '9' {
			cpuNum++
		}
		if err == io.EOF {
			break
		}
	}
	if cpuNum == 0 {
		// A zero count is never meaningful to callers; report it explicitly.
		return 0, fmt.Errorf("error scanning /proc/stat file: no cpuN entries found")
	}
	return cpuNum, nil
}
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +ctxt 2640704037 +btime 1752714561 +processes 5253419 +procs_running 2 +procs_blocked 0 +softirq 580996229 23 230614056 282 2160733 45109 0 40037 116656548 0 231479441 +` + tmpfile, err := os.CreateTemp("", "stat") + if err != nil { + t.Fatal(err) + } + defer os.Remove(tmpfile.Name()) + + if _, err := tmpfile.WriteString(content); err != nil { + t.Fatal(err) + } + if err := tmpfile.Close(); err != nil { + t.Fatal(err) + } + f, err := os.Open(tmpfile.Name()) + if err != nil { + t.Fatal(err) + 
# Runs grep on /proc/self/status both as the container's main process
# (runc run ct1) and through runc exec in a second container (ct2), and checks
# that the last column of the Cpus_allowed_list line contains "0-<N-1>", where
# N is the number of "processor" entries in /proc/cpuinfo.
# This test only changes .process.args; it does not itself set any CPU
# affinity in the config.
@test "runc exec [CPU affinity set from config.json]" {
	update_config '.process.args = [ "/bin/grep", "-F", "Cpus_allowed_list", "/proc/self/status"]'
	# Expected upper bound of the allowed list: number of processors minus one.
	# NOTE(review): with a single CPU the kernel presumably prints "0" rather
	# than "0-0" -- confirm this test never runs on a 1-CPU host.
	cpus=$(grep -c "^processor" /proc/cpuinfo)
	cpus_minus_one=$((cpus - 1))
	runc run ct1
	[ "$status" -eq 0 ]
	# The last whitespace-separated field of the grep output is the CPU list.
	last_col=$(echo "$output" | awk '{print $NF}')
	[[ "$last_col" == *"0-$cpus_minus_one"* ]] # The init process must see the full 0..N-1 range.
	# Second container: keep it running in the background so we can exec into it.
	update_config '.process.args = ["/bin/sleep", "100"]'
	runc run -d --console-socket "$CONSOLE_SOCKET" ct2
	[ "$status" -eq 0 ]
	runc exec ct2 grep -F "Cpus_allowed_list:" /proc/self/status
	[ "$status" -eq 0 ]
	last_col=$(echo "$output" | awk '{print $NF}')
	[[ "$last_col" == *"0-$cpus_minus_one"* ]] # The exec'ed process must see the full 0..N-1 range as well.
}