Skip to content

Commit 8590616

Browse files
committed
fix container os >= 3.4 && support ubuntu
Signed-off-by: bingshen.wbs <[email protected]>
1 parent f0342ea commit 8590616

File tree

8 files changed

+272
-213
lines changed

8 files changed

+272
-213
lines changed

Dockerfile

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ COPY cmd/ ./cmd/
1515
COPY api/ ./api/
1616
COPY internal/ ./internal/
1717

18-
RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \
18+
RUN export CGO_ENABLED=0; export GOOS=${TARGETOS:-linux}; export GOARCH=${TARGETARCH}; \
1919
go build -a -o manager ./cmd/controller/main.go && \
2020
go build -a -o agent ./cmd/agent/main.go && \
2121
go build -a -o smcr_init ./cmd/smcr_init/main.go
@@ -28,13 +28,20 @@ USER 65532:65532
2828

2929
ENTRYPOINT ["/manager"]
3030

31+
# Stage whose yum repos are rewritten to the public mirror (mirrors.aliyun.com); used to speed up local builds
32+
FROM alibaba-cloud-linux-3-registry.cn-hangzhou.cr.aliyuncs.com/alinux3/alinux3 AS public_mirror
33+
RUN sed -i 's/mirrors.cloud.aliyuncs.com/mirrors.aliyun.com/g' /etc/yum.repos.d/*
34+
3135
FROM alibaba-cloud-linux-3-registry.cn-hangzhou.cr.aliyuncs.com/alinux3/alinux3 AS smcr_init
32-
RUN sed -i 's/mirrors.cloud.aliyuncs.com/mirrors.aliyun.com/g' /etc/yum.repos.d/*; yum install -y smc-tools && yum clean all && rm -rf /var/cache/* /var/lib/dnf/history* /var/lib/rpm/rpm.sqlite
36+
RUN yum install -y smc-tools && yum clean all && rm -rf /var/cache/* /var/lib/dnf/history* /var/lib/rpm/rpm.sqlite
3337
COPY --from=builder /workspace/smcr_init /usr/local/bin/smcr_init
3438
ENTRYPOINT ["/usr/local/bin/smcr_init"]
3539

3640
FROM alibaba-cloud-linux-3-registry.cn-hangzhou.cr.aliyuncs.com/alinux3/alinux3 AS agent
37-
RUN sed -i 's/mirrors.cloud.aliyuncs.com/mirrors.aliyun.com/g' /etc/yum.repos.d/*; yum install -y smc-tools procps-ng && yum clean all && rm -rf /var/cache/* /var/lib/dnf/history* /var/lib/rpm/rpm.sqlite
41+
RUN --mount=type=bind,from=public_mirror,source=/etc/yum.repos.d,target=/etc/yum.repos.d \
42+
yum install -y smc-tools procps-ng kmod wget tar && yum clean all && rm -rf /var/cache/* /var/lib/dnf/history* /var/lib/rpm/rpm.sqlite
43+
# Add the Lifsea yum repo, needed to install the erdma driver on Lifsea (ContainerOS) nodes
44+
COPY hack/lifsea.repo /etc/yum.repos.d/
3845
COPY --from=builder /workspace/agent /usr/local/bin/agent
3946
COPY --from=builder /workspace/smcr_init /usr/local/bin/smcr_init
4047
ENTRYPOINT ["/usr/local/bin/agent"]

hack/lifsea.repo

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
[alinux3-lifsea]
2+
name=alinux3-lifsea
3+
baseurl=http://mirrors.cloud.aliyuncs.com/alinux/$releasever/lifsea/$basearch/
4+
http://mirrors.aliyun.com/alinux/$releasever/lifsea/$basearch/
5+
gpgcheck=1
6+
enabled=1
7+
gpgkey=http://mirrors.cloud.aliyuncs.com/alinux/3/lifsea/RPM-GPG-KEY-ALINUX-3
8+
http://mirrors.aliyun.com/alinux/3/lifsea/RPM-GPG-KEY-ALINUX-3
9+
excludepkgs=systemd*,rpcbind,container-selinux,rpm-ostree*,glibc*,ostree*,nfs-utils,libnfsidmap*,lifsea-release

internal/drivers/compat.go

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,24 +13,24 @@ func init() {
1313
Register("compat", &CompatDriver{})
1414
}
1515

16-
var compatInstallScript = `
17-
if [ -d /sys/fs/cgroup/cpu/ ]; then cat /proc/self/status | awk '/PPid:/{print $2}' > /sys/fs/cgroup/cpu/tasks && cat /proc/self/status | awk '/PPid:/{print $2}' > /sys/fs/cgroup/memory/tasks; else
18-
cat /proc/self/status | awk '/PPid:/{print $2}' > /sys/fs/cgroup/cgroup.procs; fi &&
19-
if grep -q "Alibaba Cloud Linux Lifsea" /etc/os-release; then lifseacli pkg install kernel-modules-$(uname -r); modprobe erdma compat_mode=Y; else cd /tmp && rm -f erdma_installer-1.4.0.tar.gz &&
20-
wget 'http://mirrors.cloud.aliyuncs.com/erdma/erdma_installer-1.4.0.tar.gz' && tar -xzvf erdma_installer-1.4.0.tar.gz && cd erdma_installer && yum install -y kernel-devel-$(uname -r) gcc-c++ dkms cmake && ERDMA_CM_NO_BOUND_IF=1 ERDMA_FORCE_MAD_ENABLE=1 ./install.sh --batch; fi
21-
`
22-
2316
type CompatDriver struct{}
2417

2518
func (d *CompatDriver) Install() error {
2619
exist := driverExists()
2720
if !exist {
28-
_, err := hostExec(compatInstallScript)
29-
if err != nil {
30-
return err
21+
if isContainerOS() {
22+
err := containerOSDriverInstall(true)
23+
if err != nil {
24+
return err
25+
}
26+
} else {
27+
_, err := hostExec(getInstallScript(true))
28+
if err != nil {
29+
return err
30+
}
3131
}
3232
}
33-
_, err := hostExec("if [ -f /sys/module/erdma/parameters/compat_mode ] && [ \"N\" == $(cat /sys/module/erdma/parameters/compat_mode) ]; then rmmod erdma && modprobe erdma compat_mode=Y; else modprobe erdma compat_mode=Y; fi")
33+
_, err := containerExec("if [ -f /sys/module/erdma/parameters/compat_mode ] && [ \"N\" == $(cat /sys/module/erdma/parameters/compat_mode) ]; then rmmod erdma && modprobe erdma compat_mode=Y; else modprobe erdma compat_mode=Y; fi")
3434
if err != nil {
3535
return fmt.Errorf("install erdma driver failed: %v", err)
3636
}

internal/drivers/default.go

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,25 +13,24 @@ func init() {
1313
Register(defaultDriver, &DefaultDriver{})
1414
}
1515

16-
var defaultInstallScript = `
17-
if [ -d /sys/fs/cgroup/cpu/ ]; then cat /proc/self/status | awk '/PPid:/{print $2}' > /sys/fs/cgroup/cpu/tasks && cat /proc/self/status | awk '/PPid:/{print $2}' > /sys/fs/cgroup/memory/tasks; else
18-
cat /proc/self/status | awk '/PPid:/{print $2}' > /sys/fs/cgroup/cgroup.procs; fi &&
19-
if grep -q "Alibaba Cloud Linux Lifsea" /etc/os-release; then lifseacli pkg install kernel-modules-$(uname -r); modprobe erdma; else
20-
cd /tmp && rm -f erdma_installer-1.4.0.tar.gz &&
21-
wget 'http://mirrors.cloud.aliyuncs.com/erdma/erdma_installer-1.4.0.tar.gz' && tar -xzvf erdma_installer-1.4.0.tar.gz && cd erdma_installer && yum install -y kernel-devel-$(uname -r) gcc-c++ dkms cmake && ERDMA_CM_NO_BOUND_IF=1 ./install.sh --batch; fi
22-
`
23-
2416
type DefaultDriver struct{}
2517

2618
func (d *DefaultDriver) Install() error {
2719
exist := driverExists()
2820
if !exist {
29-
_, err := hostExec(defaultInstallScript)
30-
if err != nil {
31-
return err
21+
if isContainerOS() {
22+
err := containerOSDriverInstall(false)
23+
if err != nil {
24+
return err
25+
}
26+
} else {
27+
_, err := hostExec(getInstallScript(false))
28+
if err != nil {
29+
return err
30+
}
3231
}
3332
}
34-
_, err := hostExec("if [ -f /sys/module/erdma/parameters/compat_mode ] && [ \"Y\" == $(cat /sys/module/erdma/parameters/compat_mode) ]; then rmmod erdma && modprobe erdma compat_mode=N; else modprobe erdma compat_mode=N; fi")
33+
_, err := containerExec("if [ -f /sys/module/erdma/parameters/compat_mode ] && [ \"Y\" == $(cat /sys/module/erdma/parameters/compat_mode) ]; then rmmod erdma && modprobe erdma compat_mode=N; else modprobe erdma compat_mode=N; fi")
3534
if err != nil {
3635
return fmt.Errorf("install erdma driver failed: %v", err)
3736
}

internal/drivers/ofed.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,19 +20,23 @@ cd /tmp && rm -f env_setup.sh && wget http://mirrors.cloud.aliyuncs.com/erdma/en
2020
type OFEDDriver struct{}
2121

2222
func (d *OFEDDriver) Install() error {
23+
execMethod := hostExec
24+
if isContainerOS() {
25+
execMethod = containerExec
26+
}
2327
exist := driverExists()
2428
if !exist {
25-
_, err := hostExec(gpuInstallScript)
29+
_, err := execMethod(gpuInstallScript)
2630
if err != nil {
2731
return err
2832
}
2933
}
30-
_, err := hostExec("if [ -f /sys/module/erdma/parameters/compat_mode ] && [ \"N\" == $(cat /sys/module/erdma/parameters/compat_mode) ]; then rmmod erdma && modprobe erdma compat_mode=Y; else modprobe erdma compat_mode=Y; fi")
34+
_, err := execMethod("if [ -f /sys/module/erdma/parameters/compat_mode ] && [ \"N\" == $(cat /sys/module/erdma/parameters/compat_mode) ]; then rmmod erdma && modprobe erdma compat_mode=Y; else modprobe erdma compat_mode=Y; fi")
3135
if err != nil {
3236
return fmt.Errorf("install erdma driver failed: %v", err)
3337
}
3438

35-
_, err = hostExec("modprobe erdma")
39+
_, err = execMethod("modprobe erdma")
3640
if err != nil {
3741
return fmt.Errorf("install erdma driver failed: %v", err)
3842
}

internal/drivers/utils.go

Lines changed: 14 additions & 184 deletions
Original file line numberDiff line numberDiff line change
@@ -1,198 +1,28 @@
1-
//go:build linux
2-
31
package drivers
42

53
import (
6-
"bytes"
74
"fmt"
8-
"io/fs"
9-
"net"
10-
"os"
115
"os/exec"
12-
"path"
13-
"path/filepath"
14-
"strconv"
15-
"strings"
16-
17-
"github.com/AliyunContainerService/alibabacloud-erdma-controller/internal/types"
18-
"github.com/samber/lo"
19-
"github.com/vishvananda/netlink"
20-
)
21-
22-
func driverExists() bool {
23-
if isContainerOS() {
24-
_, err := hostExec("modinfo erdma")
25-
if err != nil {
26-
driverLog.Info("driver not exists", "checklog", err)
27-
return false
28-
}
29-
return true
30-
}
31-
_, err := hostExec("stat /bin/eadm && modinfo erdma")
32-
if err != nil {
33-
driverLog.Info("driver not exists", "checklog", err)
34-
return false
35-
}
36-
return true
37-
}
38-
39-
func isContainerOS() bool {
40-
_, err := hostExec("grep -q \"Alibaba Cloud Linux Lifsea\" /etc/os-release")
41-
return err == nil
42-
}
43-
44-
//nolint:unparam
45-
func hostExec(cmd string) (string, error) {
46-
output, err := exec.Command("nsenter", "-t", "1", "-m", "--", "bash", "-c", cmd).CombinedOutput()
47-
if err != nil {
48-
return "", fmt.Errorf("exec error: %v, output: %s", err, string(output))
49-
}
50-
return string(output), nil
51-
}
52-
53-
func EnsureSMCR() error {
54-
_, err := hostExec("which smcss || yum install -y smc-tools || apt install -y smc-tools || lifseacli pkg install smc-tools")
55-
if err != nil {
56-
return err
57-
}
58-
_, err = hostExec("modprobe smc")
59-
if err != nil {
60-
return err
61-
}
62-
return nil
63-
}
64-
65-
func GetERdmaDevPathsFromRdmaLink(rdmaLink *netlink.RdmaLink) ([]string, error) {
66-
var devPaths []string
67-
ibUverbsDevs, err := os.ReadDir("/sys/class/infiniband_verbs/")
68-
if err != nil {
69-
return nil, fmt.Errorf("read dir /sys/class/infiniband_verbs/ failed: %v", err)
70-
}
71-
lo.ForEach(ibUverbsDevs, func(ibUverbsDev fs.DirEntry, _ int) {
72-
ibDevPath := filepath.Join("/sys/class/infiniband_verbs/", ibUverbsDev.Name(), "ibdev")
73-
driverLog.Info("check infiniband path", "path", ibDevPath)
74-
if _, err = os.Stat(ibDevPath); err == nil {
75-
if devName, err := os.ReadFile(ibDevPath); err == nil {
76-
devNameStr := strings.Trim(string(devName), "\n")
77-
driverLog.Info("infiniband device", "devName", devNameStr)
78-
if devNameStr == rdmaLink.Attrs.Name {
79-
devPaths = append(devPaths, filepath.Join("/dev/infiniband", ibUverbsDev.Name()))
80-
}
81-
}
82-
}
83-
})
84-
if len(devPaths) == 0 {
85-
return nil, fmt.Errorf("can not find dev path for %s", rdmaLink.Attrs.Name)
86-
}
87-
88-
if _, err := os.Stat("/dev/infiniband/rdma_cm"); err == nil {
89-
devPaths = append(devPaths, "/dev/infiniband/rdma_cm")
90-
}
91-
return devPaths, nil
92-
}
93-
func GetERdmaFromLink(link netlink.Link) (*netlink.RdmaLink, error) {
94-
rdmaLinks, err := netlink.RdmaLinkList()
95-
if err != nil {
96-
return nil, fmt.Errorf("error list rdma links, %v", err)
97-
}
98-
linkHwAddr := link.Attrs().HardwareAddr
99-
// erdma guid first byte is ^= 0x2
100-
linkHwAddr[0] ^= 0x2
101-
for _, rl := range rdmaLinks {
102-
rdmaHwAddr, err := parseERdmaLinkHwAddr(rl.Attrs.NodeGuid)
103-
if err != nil {
104-
return nil, err
105-
}
106-
driverLog.Info("check rdma link", "rdmaLink", rl.Attrs.Name, "rdmaHwAddr", rdmaHwAddr.String(), "linkHwAddr", linkHwAddr.String())
107-
if rdmaHwAddr.String() == linkHwAddr.String() {
108-
return rl, nil
109-
}
110-
}
111-
return nil, fmt.Errorf("cannot found rdma link for %s", link.Attrs().Name)
112-
}
113-
114-
func parseERdmaLinkHwAddr(guid string) (net.HardwareAddr, error) {
115-
hwAddrSlice := make([]byte, 8)
116-
guidSlice := strings.Split(guid, ":")
117-
if len(guidSlice) != 8 {
118-
return nil, fmt.Errorf("invalid rdma guid: %s", guid)
119-
}
120-
for i, s := range guidSlice {
121-
sint, err := strconv.ParseUint(s, 16, 8)
122-
if err != nil {
123-
return nil, fmt.Errorf("invalid rdma guid: %s, err: %v", guid, err)
124-
}
125-
hwAddrSlice[7-i] = uint8(sint)
126-
}
127-
return append(hwAddrSlice[0:3], hwAddrSlice[5:8]...), nil
128-
}
129-
130-
const (
131-
smcPnet = "smc_pnet"
1326
)
1337

134-
func ConfigSMCPnetForDevice(info *types.ERdmaDeviceInfo) error {
135-
output, err := exec.Command(smcPnet, "-s").CombinedOutput()
136-
if err != nil {
137-
return fmt.Errorf("failed to get smc-pnet stat: %v, output: %v", err, string(output))
138-
}
139-
if bytes.Contains(output, []byte(PNetIDFromDevice(info))) {
140-
return nil
8+
// getInstallScript returns the shell script that installs the erdma driver
// from the erdma_installer-1.4.6 tarball. The script writes the parent PID
// into the top-level cgroup files (cpu/memory "tasks" when
// /sys/fs/cgroup/cpu/ exists, otherwise cgroup.procs), downloads and unpacks
// the installer under /tmp, installs build dependencies with yum or apt, and
// runs ./install.sh --batch with ERDMA_CM_NO_BOUND_IF=1.
//
// When compat is true, ERDMA_FORCE_MAD_ENABLE=1 is added to the install.sh
// environment; otherwise the %s placeholder is filled with an empty string.
//
// NOTE(review): "&&" and "||" have equal precedence in a shell list and
// associate left to right, so the "|| (apt ...)" branch also runs when any
// earlier step (cd, rm, wget, tar, ...) fails, not only when the yum branch
// fails, and ./install.sh may then be attempted from the wrong directory.
// Consider grouping the yum/apt alternative, e.g. "{ (yum ...) || (apt ...); }".
func getInstallScript(compat bool) string {
9+
script := `if [ -d /sys/fs/cgroup/cpu/ ]; then cat /proc/self/status | awk '/PPid:/{print $2}' > /sys/fs/cgroup/cpu/tasks && cat /proc/self/status | awk '/PPid:/{print $2}' > /sys/fs/cgroup/memory/tasks; else
10+
cat /proc/self/status | awk '/PPid:/{print $2}' > /sys/fs/cgroup/cgroup.procs; fi && cd /tmp && rm -f erdma_installer-1.4.6.tar.gz &&
11+
wget 'http://mirrors.cloud.aliyuncs.com/erdma/erdma_installer-1.4.6.tar.gz' && tar -xzvf erdma_installer-1.4.6.tar.gz && cd erdma_installer &&
12+
(type yum && yum install -y kernel-devel-$(uname -r) gcc-c++ dkms cmake) || (apt update && apt install -y debhelper autotools-dev dkms libnl-3-dev libnl-route-3-dev cmake) &&
13+
ERDMA_CM_NO_BOUND_IF=1 %s ./install.sh --batch`
14+
if compat {
15+
return fmt.Sprintf(script, "ERDMA_FORCE_MAD_ENABLE=1")
14116
}
142-
output, err = exec.Command(smcPnet, "-a", PNetIDFromDevice(info), "-D", info.Name).CombinedOutput()
143-
if err != nil {
144-
return fmt.Errorf("failed to config smc-pnet rdma device: %v, output: %v", err, string(output))
145-
}
146-
return nil
147-
}
148-
149-
func PNetIDFromDevice(info *types.ERdmaDeviceInfo) string {
150-
return strings.ReplaceAll(strings.ToUpper(info.MAC), ":", "")
17+
return fmt.Sprintf(script, "")
15118
}
15219

153-
func ConfigForNetDevice(pnet string, netDevice string) error {
154-
output, err := exec.Command(smcPnet, "-s").CombinedOutput()
155-
if err != nil {
156-
return fmt.Errorf("failed to get smc-pnet stat for net device: %v, output: %v", err, string(output))
157-
}
158-
if bytes.Contains(output, []byte(netDevice)) {
159-
return nil
160-
}
161-
output, err = exec.Command(smcPnet, "-a", pnet, "-I", netDevice).CombinedOutput()
20+
// containerOSDriverInstall installs the kernel-modules package matching the
// running kernel (kernel-modules-$(uname -r)) with yum. The command is run
// directly via exec.Command("/usr/bin/bash", "-c", ...).
//
// The compat flag is only written to the log here; it does not change the
// command that is executed. On failure the returned error carries the
// command's combined stdout/stderr output.
func containerOSDriverInstall(compat bool) error {
21+
driverLog.Info("install driver in container os", "compat", compat)
22+
containerOSScript := `yum install -y kernel-modules-$(uname -r)`
23+
output, err := exec.Command("/usr/bin/bash", "-c", containerOSScript).CombinedOutput()
16224
if err != nil {
163-
return fmt.Errorf("failed to config smc-pnet net device: %v, output: %v", err, string(output))
25+
return fmt.Errorf("exec error: %v, output: %s", err, string(output))
16426
}
16527
return nil
16628
}
167-
168-
func ConfigForNetnsNetDevice(pnet string, netDevice string, netns string) error {
169-
output, err := exec.Command("nsenter", "-n/proc/1/root/"+netns, "--", smcPnet, "-s").CombinedOutput()
170-
if err != nil {
171-
return fmt.Errorf("failed to get smc-pnet stat for net device: %v, output: %v", err, string(output))
172-
}
173-
if bytes.Contains(output, []byte(netDevice)) {
174-
return nil
175-
}
176-
output, err = exec.Command("nsenter", "-n/proc/1/root/"+netns, "--", smcPnet, "-a", pnet, "-I", netDevice).CombinedOutput()
177-
if err != nil {
178-
return fmt.Errorf("failed to config smc-pnet net device: %v, output: %v", err, string(output))
179-
}
180-
return nil
181-
}
182-
183-
func GetERDMANumaNode(info *netlink.RdmaLink) (int64, error) {
184-
devNumaPath := path.Join("/sys/class/infiniband/", info.Attrs.Name, "device/numa_node")
185-
numaStr, err := os.ReadFile(devNumaPath)
186-
if err != nil {
187-
return -1, fmt.Errorf("failed to get numa node for %s: %v", info.Attrs.Name, err)
188-
}
189-
numaStr = bytes.Trim(numaStr, "\n")
190-
numa, err := strconv.Atoi(string(numaStr))
191-
if err != nil {
192-
return -1, fmt.Errorf("failed to parse numa node for %s: %v", info.Attrs.Name, err)
193-
}
194-
if numa < 0 {
195-
numa = 0
196-
}
197-
return int64(numa), nil
198-
}

0 commit comments

Comments
 (0)