Skip to content

Commit d602677

Browse files
authored
Merge pull request #17 from honghai120135/main
Support simple mode to not use access key
2 parents 452eca1 + a890d6f commit d602677

File tree

7 files changed

+162
-9
lines changed

7 files changed

+162
-9
lines changed

cmd/agent/main.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,27 @@ func main() {
1919
preferDriver string
2020
allocAllDevices bool
2121
devicepluginPreStart bool
22+
localERIDiscovery bool
23+
exposedLocalERIs string
2224
)
2325
flag.StringVar(&preferDriver, "prefer-driver", "", "prefer driver")
2426
flag.BoolVar(&allocAllDevices, "allocate-all-devices", false,
2527
"allocate all erdma devices for resource request, true => alloc all, false => alloc devices based on numa")
2628
flag.BoolVar(&devicepluginPreStart, "deviceplugin-prestart-container", false,
2729
"use device plugin prestart container to config smc-r, enable it if not use webhook to inject initContainers")
30+
flag.BoolVar(&localERIDiscovery, "local-eri-discovery", false,
31+
"Only manager on-node eri resources without using OpenAPI and access key")
32+
flag.StringVar(&exposedLocalERIs, "exposed-local-eris", "",
33+
"allocate specific ERI from existing ERI to pods for each instance")
2834
flag.Parse()
2935

30-
eriAgent, err := agent.NewAgent(preferDriver, allocAllDevices, devicepluginPreStart)
36+
eriAgent, err := agent.NewAgent(
37+
preferDriver,
38+
allocAllDevices,
39+
devicepluginPreStart,
40+
localERIDiscovery,
41+
exposedLocalERIs,
42+
)
3143
if err != nil {
3244
panic(err)
3345
}

deploy/helm/templates/configmap.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,6 @@ data:
1717
"enableWebhook": {{ .Values.config.enableWebhook }},
1818
"smcInitImage": "{{ .Values.config.smcInitImage }}",
1919
"enableInitContainerInject": {{ .Values.config.enableInitContainerInject }},
20+
"localERIDiscovery": {{ .Values.config.localERIDiscovery }},
2021
"nodeSelector": {{ .Values.nodeSelector | toJson }}
2122
}

deploy/helm/templates/daemonset.yaml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ spec:
2121
spec:
2222
hostPID: true
2323
hostNetwork: true
24-
{{- with .Values.imagePullSecrets }}
24+
{{- with .Values.agent.imagePullSecrets }}
2525
imagePullSecrets:
2626
{{- toYaml . | nindent 8 }}
2727
{{- end }}
@@ -44,6 +44,12 @@ spec:
4444
{{ if .Values.agent.allocateAllDevices }}
4545
- --allocate-all-devices
4646
{{ end }}
47+
{{ if .Values.config.localERIDiscovery }}
48+
- --local-eri-discovery
49+
{{ end }}
50+
{{ if .Values.agent.exposedLocalERIs }}
51+
- --exposed-local-eris={{ join "," .Values.agent.exposedLocalERIs }}
52+
{{ end }}
4753
{{ if not .Values.config.enableWebhook }}
4854
- --deviceplugin-prestart-container
4955
{{ end }}

deploy/helm/templates/deployment.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
{{- if not .Values.config.localERIDiscovery }}
2+
---
13
apiVersion: apps/v1
24
kind: Deployment
35
metadata:
@@ -64,3 +66,4 @@ spec:
6466
tolerations:
6567
{{- toYaml . | nindent 8 }}
6668
{{- end }}
69+
{{- end }}

deploy/helm/values.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# This is a YAML-formatted file.
33
# Declare variables to be passed into your templates.
44

5+
# controller will not be deployed if localERIDiscovery is set
56
controller:
67
replicaCount: 2
78
image:
@@ -24,6 +25,12 @@ agent:
2425
tag: "latest"
2526
preferDriver: ""
2627
allocateAllDevices: false
28+
# format:
29+
# expose specific eris for matched node: - <instance_id> <eri-0>/<eri-1>/...
30+
# expose specific eris for unmatched node: - i-* <eri-0>/<eri-1>/...
31+
# expose all eris for unmatched node: - i-* erdma_*
32+
exposedLocalERIs:
33+
- i-XXX erdma_0/erdma_1
2734
imagePullSecrets: []
2835
nameOverride: ""
2936
fullnameOverride: ""
@@ -57,6 +64,7 @@ config:
5764
enableWebhook: false
5865
enableInitContainerInject: true
5966
smcInitImage: ""
67+
localERIDiscovery: false
6068

6169
credentials:
6270
type: ""

internal/agent/agent.go

Lines changed: 42 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,17 @@ import (
55
"os"
66
"os/signal"
77
"runtime"
8+
"strings"
89
"syscall"
910

1011
"github.com/AliyunContainerService/alibabacloud-erdma-controller/internal/deviceplugin"
1112
"github.com/AliyunContainerService/alibabacloud-erdma-controller/internal/drivers"
1213
"github.com/AliyunContainerService/alibabacloud-erdma-controller/internal/k8s"
1314
"github.com/AliyunContainerService/alibabacloud-erdma-controller/internal/types"
15+
"github.com/samber/lo"
1416
ctrl "sigs.k8s.io/controller-runtime"
17+
18+
networkv1 "github.com/AliyunContainerService/alibabacloud-erdma-controller/api/v1"
1519
)
1620

1721
var (
@@ -23,6 +27,8 @@ type Agent struct {
2327
driver drivers.ERdmaDriver
2428
allocAllDevices bool
2529
devicepluginPreStart bool
30+
localERIDiscovery bool
31+
exposedLocalERIs []string
2632
}
2733

2834
func stackTriger() {
@@ -48,25 +54,56 @@ func stackTriger() {
4854
signal.Notify(sigchain, syscall.SIGUSR1)
4955
}
5056

51-
func NewAgent(preferDriver string, allocAllDevice bool, devicepluginPreStart bool) (*Agent, error) {
57+
func NewAgent(preferDriver string, allocAllDevice bool, devicepluginPreStart bool, localERIDiscovery bool, exposedLocalERIs string) (*Agent, error) {
5258
kubernetes, err := k8s.NewKubernetes()
5359
if err != nil {
5460
return nil, err
5561
}
62+
agentLog.Info("NewAgent: ", "localERIDiscovery", localERIDiscovery)
5663
return &Agent{
5764
kubernetes: kubernetes,
5865
driver: drivers.GetDriver(preferDriver),
5966
allocAllDevices: allocAllDevice,
6067
devicepluginPreStart: devicepluginPreStart,
68+
localERIDiscovery: localERIDiscovery,
69+
exposedLocalERIs: strings.Split(exposedLocalERIs, ","),
6170
}, nil
6271
}
6372

6473
func (a *Agent) Run() error {
6574
go stackTriger()
66-
// 1. wait related eri device
67-
eriInfos, err := a.kubernetes.WaitEriInfo()
68-
if err != nil {
69-
return err
75+
var err error
76+
var eriInfos *networkv1.ERdmaDevice
77+
var eri []*types.ERI
78+
if !a.localERIDiscovery {
79+
// 1. wait related eri device
80+
eriInfos, err = a.kubernetes.WaitEriInfo()
81+
if err != nil {
82+
return err
83+
}
84+
} else {
85+
if !(len(a.exposedLocalERIs) == 1 && a.exposedLocalERIs[0] == "") {
86+
a.allocAllDevices = true
87+
agentLog.Info("LocalERIDiscovery: enable expose ERIs, set allocAllDevices to true")
88+
}
89+
eri, err = drivers.SelectERIs(a.exposedLocalERIs)
90+
if err != nil {
91+
return fmt.Errorf("LocalERIDiscovery: select eri failed: %v", err)
92+
}
93+
eriInfos = &networkv1.ERdmaDevice{
94+
Spec: networkv1.ERdmaDeviceSpec{
95+
Devices: lo.Map(eri, func(item *types.ERI, index int) networkv1.DeviceInfo {
96+
return networkv1.DeviceInfo{
97+
InstanceID: item.InstanceID,
98+
MAC: item.MAC,
99+
IsPrimaryENI: item.IsPrimaryENI,
100+
ID: item.ID,
101+
NetworkCardIndex: item.CardIndex,
102+
QueuePair: item.QueuePair,
103+
}
104+
}),
105+
},
106+
}
70107
}
71108
agentLog.Info("eri info", "eriInfo", eriInfos, "driver", a.driver.Name())
72109
// 2. install eri driver

internal/drivers/utils_linux.go

Lines changed: 88 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,52 @@ import (
1111
"os/exec"
1212
"path"
1313
"path/filepath"
14+
"regexp"
1415
"strconv"
1516
"strings"
1617

1718
"github.com/AliyunContainerService/alibabacloud-erdma-controller/internal/types"
19+
"github.com/AliyunContainerService/alibabacloud-erdma-controller/internal/utils"
1820
"github.com/samber/lo"
1921
"github.com/vishvananda/netlink"
2022
)
2123

24+
func checkExpose(instanceID string, exposedLocalERIs []string, rdmaDevice string) (bool, error) {
25+
var unMatchExposeERIs []string
26+
isMatched := false
27+
if len(exposedLocalERIs) == 1 && exposedLocalERIs[0] == "" {
28+
return true, nil
29+
}
30+
pattern := `^(i-(?:\w+|\*))\s+((?:(?:\w+)(?:\/\w+)*))$`
31+
re := regexp.MustCompile(pattern)
32+
for _, exposeInfo := range exposedLocalERIs {
33+
if !re.MatchString(exposeInfo) {
34+
return false, fmt.Errorf("invalid format %s", exposeInfo)
35+
}
36+
id := strings.SplitN(exposeInfo, " ", 2)[0]
37+
if instanceID == id {
38+
isMatched = true
39+
exposeERIs := strings.Split(strings.TrimSpace(strings.SplitN(exposeInfo, " ", 2)[1]), "/")
40+
for _, dev := range exposeERIs {
41+
if dev == rdmaDevice {
42+
return true, nil
43+
}
44+
}
45+
}
46+
if id == "i-*" {
47+
unMatchExposeERIs = strings.Split(strings.TrimSpace(strings.SplitN(exposeInfo, " ", 2)[1]), "/")
48+
}
49+
}
50+
if !isMatched {
51+
driverLog.Info("no matched instanceID found, using unMatchExposeERIs", "instanceID", instanceID)
52+
for _, dev := range unMatchExposeERIs {
53+
if dev == "erdma_*" || dev == rdmaDevice {
54+
return true, nil
55+
}
56+
}
57+
}
58+
return false, nil
59+
}
2260
func driverExists() bool {
2361
if isContainerOS() {
2462
_, err := containerExec("modinfo erdma")
@@ -104,14 +142,16 @@ func GetERdmaFromLink(link netlink.Link) (*netlink.RdmaLink, error) {
104142
}
105143
linkHwAddr := link.Attrs().HardwareAddr
106144
// erdma guid first byte is ^= 0x2
107-
linkHwAddr[0] ^= 0x2
145+
new_linkHwAddr := make(net.HardwareAddr, len(linkHwAddr))
146+
copy(new_linkHwAddr, linkHwAddr)
147+
new_linkHwAddr[0] ^= 0x2
108148
for _, rl := range rdmaLinks {
109149
rdmaHwAddr, err := parseERdmaLinkHwAddr(rl.Attrs.NodeGuid)
110150
if err != nil {
111151
return nil, err
112152
}
113153
driverLog.Info("check rdma link", "rdmaLink", rl.Attrs.Name, "rdmaHwAddr", rdmaHwAddr.String(), "linkHwAddr", linkHwAddr.String())
114-
if rdmaHwAddr.String() == linkHwAddr.String() {
154+
if rdmaHwAddr.String() == new_linkHwAddr.String() {
115155
return rl, nil
116156
}
117157
}
@@ -203,3 +243,49 @@ func GetERDMANumaNode(info *netlink.RdmaLink) (int64, error) {
203243
}
204244
return int64(numa), nil
205245
}
246+
247+
const (
248+
instanceIDAddr = "http://100.100.100.200/latest/meta-data/instance-id"
249+
)
250+
251+
func SelectERIs(exposedLocalERIs []string) ([]*types.ERI, error) {
252+
var selectEriList []*types.ERI
253+
var isExposed bool
254+
instanceID, _ := utils.GetStrFromMetadata(instanceIDAddr)
255+
links, err := netlink.LinkList()
256+
if err != nil {
257+
return nil, fmt.Errorf("list link failed: %v", err)
258+
}
259+
260+
for _, link := range links {
261+
if _, ok := link.(*netlink.Device); !ok {
262+
continue
263+
}
264+
if link.Attrs().HardwareAddr != nil {
265+
rdmaLink, _ := GetERdmaFromLink(link)
266+
if rdmaLink != nil {
267+
rdmadevice := rdmaLink.Attrs.Name
268+
isExposed, err = checkExpose(instanceID, exposedLocalERIs, rdmadevice)
269+
if isExposed {
270+
driverLog.Info("LocalERIDiscovery: expose eri", "rdmadevice", rdmadevice, "link name", link.Attrs().Name)
271+
eri := &types.ERI{
272+
ID: rdmadevice,
273+
IsPrimaryENI: link.Attrs().Name == "eth0",
274+
MAC: link.Attrs().HardwareAddr.String(),
275+
InstanceID: instanceID,
276+
CardIndex: -1,
277+
QueuePair: -1,
278+
}
279+
selectEriList = append(selectEriList, eri)
280+
driverLog.Info("Simple mode SelectERIs: eri", "eri", eri)
281+
} else if err != nil {
282+
return nil, err
283+
}
284+
} else {
285+
driverLog.Info("LocalERIDiscovery: link is not rdma device, skip", "link_name", link.Attrs().Name)
286+
}
287+
}
288+
}
289+
290+
return selectEriList, nil
291+
}

0 commit comments

Comments
 (0)