|
1 |
| -//go:build linux |
2 |
| - |
3 | 1 | package drivers
|
4 | 2 |
|
5 | 3 | import (
|
6 |
| - "bytes" |
7 | 4 | "fmt"
|
8 |
| - "io/fs" |
9 |
| - "net" |
10 |
| - "os" |
11 | 5 | "os/exec"
|
12 |
| - "path" |
13 |
| - "path/filepath" |
14 |
| - "strconv" |
15 |
| - "strings" |
16 |
| - |
17 |
| - "github.com/AliyunContainerService/alibabacloud-erdma-controller/internal/types" |
18 |
| - "github.com/samber/lo" |
19 |
| - "github.com/vishvananda/netlink" |
20 |
| -) |
21 |
| - |
22 |
| -func driverExists() bool { |
23 |
| - if isContainerOS() { |
24 |
| - _, err := hostExec("modinfo erdma") |
25 |
| - if err != nil { |
26 |
| - driverLog.Info("driver not exists", "checklog", err) |
27 |
| - return false |
28 |
| - } |
29 |
| - return true |
30 |
| - } |
31 |
| - _, err := hostExec("stat /bin/eadm && modinfo erdma") |
32 |
| - if err != nil { |
33 |
| - driverLog.Info("driver not exists", "checklog", err) |
34 |
| - return false |
35 |
| - } |
36 |
| - return true |
37 |
| -} |
38 |
| - |
39 |
| -func isContainerOS() bool { |
40 |
| - _, err := hostExec("grep -q \"Alibaba Cloud Linux Lifsea\" /etc/os-release") |
41 |
| - return err == nil |
42 |
| -} |
43 |
| - |
44 |
// hostExec runs cmd through `bash -c` after entering the mount namespace of
// PID 1 with nsenter, and returns the combined stdout and stderr.
//
// On failure the returned string is empty; the error wraps the underlying
// exec error (so errors.Is / errors.As keep working) and carries whatever
// output the command produced.
//
//nolint:unparam
func hostExec(cmd string) (string, error) {
	output, err := exec.Command("nsenter", "-t", "1", "-m", "--", "bash", "-c", cmd).CombinedOutput()
	if err != nil {
		return "", fmt.Errorf("exec error: %w, output: %s", err, output)
	}
	return string(output), nil
}
52 |
| - |
53 |
| -func EnsureSMCR() error { |
54 |
| - _, err := hostExec("which smcss || yum install -y smc-tools || apt install -y smc-tools || lifseacli pkg install smc-tools") |
55 |
| - if err != nil { |
56 |
| - return err |
57 |
| - } |
58 |
| - _, err = hostExec("modprobe smc") |
59 |
| - if err != nil { |
60 |
| - return err |
61 |
| - } |
62 |
| - return nil |
63 |
| -} |
64 |
| - |
65 |
| -func GetERdmaDevPathsFromRdmaLink(rdmaLink *netlink.RdmaLink) ([]string, error) { |
66 |
| - var devPaths []string |
67 |
| - ibUverbsDevs, err := os.ReadDir("/sys/class/infiniband_verbs/") |
68 |
| - if err != nil { |
69 |
| - return nil, fmt.Errorf("read dir /sys/class/infiniband_verbs/ failed: %v", err) |
70 |
| - } |
71 |
| - lo.ForEach(ibUverbsDevs, func(ibUverbsDev fs.DirEntry, _ int) { |
72 |
| - ibDevPath := filepath.Join("/sys/class/infiniband_verbs/", ibUverbsDev.Name(), "ibdev") |
73 |
| - driverLog.Info("check infiniband path", "path", ibDevPath) |
74 |
| - if _, err = os.Stat(ibDevPath); err == nil { |
75 |
| - if devName, err := os.ReadFile(ibDevPath); err == nil { |
76 |
| - devNameStr := strings.Trim(string(devName), "\n") |
77 |
| - driverLog.Info("infiniband device", "devName", devNameStr) |
78 |
| - if devNameStr == rdmaLink.Attrs.Name { |
79 |
| - devPaths = append(devPaths, filepath.Join("/dev/infiniband", ibUverbsDev.Name())) |
80 |
| - } |
81 |
| - } |
82 |
| - } |
83 |
| - }) |
84 |
| - if len(devPaths) == 0 { |
85 |
| - return nil, fmt.Errorf("can not find dev path for %s", rdmaLink.Attrs.Name) |
86 |
| - } |
87 |
| - |
88 |
| - if _, err := os.Stat("/dev/infiniband/rdma_cm"); err == nil { |
89 |
| - devPaths = append(devPaths, "/dev/infiniband/rdma_cm") |
90 |
| - } |
91 |
| - return devPaths, nil |
92 |
| -} |
93 |
| -func GetERdmaFromLink(link netlink.Link) (*netlink.RdmaLink, error) { |
94 |
| - rdmaLinks, err := netlink.RdmaLinkList() |
95 |
| - if err != nil { |
96 |
| - return nil, fmt.Errorf("error list rdma links, %v", err) |
97 |
| - } |
98 |
| - linkHwAddr := link.Attrs().HardwareAddr |
99 |
| - // erdma guid first byte is ^= 0x2 |
100 |
| - linkHwAddr[0] ^= 0x2 |
101 |
| - for _, rl := range rdmaLinks { |
102 |
| - rdmaHwAddr, err := parseERdmaLinkHwAddr(rl.Attrs.NodeGuid) |
103 |
| - if err != nil { |
104 |
| - return nil, err |
105 |
| - } |
106 |
| - driverLog.Info("check rdma link", "rdmaLink", rl.Attrs.Name, "rdmaHwAddr", rdmaHwAddr.String(), "linkHwAddr", linkHwAddr.String()) |
107 |
| - if rdmaHwAddr.String() == linkHwAddr.String() { |
108 |
| - return rl, nil |
109 |
| - } |
110 |
| - } |
111 |
| - return nil, fmt.Errorf("cannot found rdma link for %s", link.Attrs().Name) |
112 |
| -} |
113 |
| - |
114 |
// parseERdmaLinkHwAddr converts an RDMA node GUID into a 6-byte hardware
// address.
//
// guid must consist of exactly eight colon-separated hexadecimal bytes. The
// bytes are reversed, and the two middle bytes of the reversed sequence
// (indexes 3 and 4) are dropped. For example "aa:bb:cc:dd:ee:ff:11:22" yields
// 22:11:ff:cc:bb:aa.
//
// An error is returned when guid does not have eight fields or a field is not
// a valid 8-bit hex number; in the latter case the strconv error is wrapped.
func parseERdmaLinkHwAddr(guid string) (net.HardwareAddr, error) {
	guidSlice := strings.Split(guid, ":")
	if len(guidSlice) != 8 {
		return nil, fmt.Errorf("invalid rdma guid: %s", guid)
	}

	var reversed [8]byte
	for i, s := range guidSlice {
		b, err := strconv.ParseUint(s, 16, 8)
		if err != nil {
			return nil, fmt.Errorf("invalid rdma guid: %s, err: %w", guid, err)
		}
		reversed[7-i] = uint8(b)
	}

	// Build a fresh 6-byte slice instead of appending onto a sub-slice that
	// shares its backing array with the scratch buffer.
	return net.HardwareAddr{
		reversed[0], reversed[1], reversed[2],
		reversed[5], reversed[6], reversed[7],
	}, nil
}
129 |
| - |
130 |
| -const ( |
131 |
| - smcPnet = "smc_pnet" |
132 | 6 | )
|
133 | 7 |
|
134 |
| -func ConfigSMCPnetForDevice(info *types.ERdmaDeviceInfo) error { |
135 |
| - output, err := exec.Command(smcPnet, "-s").CombinedOutput() |
136 |
| - if err != nil { |
137 |
| - return fmt.Errorf("failed to get smc-pnet stat: %v, output: %v", err, string(output)) |
138 |
| - } |
139 |
| - if bytes.Contains(output, []byte(PNetIDFromDevice(info))) { |
140 |
| - return nil |
| 8 | +func getInstallScript(compat bool) string { |
| 9 | + script := `if [ -d /sys/fs/cgroup/cpu/ ]; then cat /proc/self/status | awk '/PPid:/{print $2}' > /sys/fs/cgroup/cpu/tasks && cat /proc/self/status | awk '/PPid:/{print $2}' > /sys/fs/cgroup/memory/tasks; else |
| 10 | +cat /proc/self/status | awk '/PPid:/{print $2}' > /sys/fs/cgroup/cgroup.procs; fi && cd /tmp && rm -f erdma_installer-1.4.6.tar.gz && |
| 11 | +wget 'http://mirrors.cloud.aliyuncs.com/erdma/erdma_installer-1.4.6.tar.gz' && tar -xzvf erdma_installer-1.4.6.tar.gz && cd erdma_installer && |
| 12 | +(type yum && yum install -y kernel-devel-$(uname -r) gcc-c++ dkms cmake) || (apt update && apt install -y debhelper autotools-dev dkms libnl-3-dev libnl-route-3-dev cmake) && |
| 13 | +ERDMA_CM_NO_BOUND_IF=1 %s ./install.sh --batch` |
| 14 | + if compat { |
| 15 | + return fmt.Sprintf(script, "ERDMA_FORCE_MAD_ENABLE=1") |
141 | 16 | }
|
142 |
| - output, err = exec.Command(smcPnet, "-a", PNetIDFromDevice(info), "-D", info.Name).CombinedOutput() |
143 |
| - if err != nil { |
144 |
| - return fmt.Errorf("failed to config smc-pnet rdma device: %v, output: %v", err, string(output)) |
145 |
| - } |
146 |
| - return nil |
147 |
| -} |
148 |
| - |
149 |
| -func PNetIDFromDevice(info *types.ERdmaDeviceInfo) string { |
150 |
| - return strings.ReplaceAll(strings.ToUpper(info.MAC), ":", "") |
| 17 | + return fmt.Sprintf(script, "") |
151 | 18 | }
|
152 | 19 |
|
153 |
| -func ConfigForNetDevice(pnet string, netDevice string) error { |
154 |
| - output, err := exec.Command(smcPnet, "-s").CombinedOutput() |
155 |
| - if err != nil { |
156 |
| - return fmt.Errorf("failed to get smc-pnet stat for net device: %v, output: %v", err, string(output)) |
157 |
| - } |
158 |
| - if bytes.Contains(output, []byte(netDevice)) { |
159 |
| - return nil |
160 |
| - } |
161 |
| - output, err = exec.Command(smcPnet, "-a", pnet, "-I", netDevice).CombinedOutput() |
| 20 | +func containerOSDriverInstall(compat bool) error { |
| 21 | + driverLog.Info("install driver in container os", "compat", compat) |
| 22 | + containerOSScript := `yum install -y kernel-modules-$(uname -r)` |
| 23 | + output, err := exec.Command("/usr/bin/bash", "-c", containerOSScript).CombinedOutput() |
162 | 24 | if err != nil {
|
163 |
| - return fmt.Errorf("failed to config smc-pnet net device: %v, output: %v", err, string(output)) |
| 25 | + return fmt.Errorf("exec error: %v, output: %s", err, string(output)) |
164 | 26 | }
|
165 | 27 | return nil
|
166 | 28 | }
|
167 |
| - |
168 |
| -func ConfigForNetnsNetDevice(pnet string, netDevice string, netns string) error { |
169 |
| - output, err := exec.Command("nsenter", "-n/proc/1/root/"+netns, "--", smcPnet, "-s").CombinedOutput() |
170 |
| - if err != nil { |
171 |
| - return fmt.Errorf("failed to get smc-pnet stat for net device: %v, output: %v", err, string(output)) |
172 |
| - } |
173 |
| - if bytes.Contains(output, []byte(netDevice)) { |
174 |
| - return nil |
175 |
| - } |
176 |
| - output, err = exec.Command("nsenter", "-n/proc/1/root/"+netns, "--", smcPnet, "-a", pnet, "-I", netDevice).CombinedOutput() |
177 |
| - if err != nil { |
178 |
| - return fmt.Errorf("failed to config smc-pnet net device: %v, output: %v", err, string(output)) |
179 |
| - } |
180 |
| - return nil |
181 |
| -} |
182 |
| - |
183 |
| -func GetERDMANumaNode(info *netlink.RdmaLink) (int64, error) { |
184 |
| - devNumaPath := path.Join("/sys/class/infiniband/", info.Attrs.Name, "device/numa_node") |
185 |
| - numaStr, err := os.ReadFile(devNumaPath) |
186 |
| - if err != nil { |
187 |
| - return -1, fmt.Errorf("failed to get numa node for %s: %v", info.Attrs.Name, err) |
188 |
| - } |
189 |
| - numaStr = bytes.Trim(numaStr, "\n") |
190 |
| - numa, err := strconv.Atoi(string(numaStr)) |
191 |
| - if err != nil { |
192 |
| - return -1, fmt.Errorf("failed to parse numa node for %s: %v", info.Attrs.Name, err) |
193 |
| - } |
194 |
| - if numa < 0 { |
195 |
| - numa = 0 |
196 |
| - } |
197 |
| - return int64(numa), nil |
198 |
| -} |
0 commit comments