Skip to content

Commit ab0fece

Browse files
committed
optimise exist Eri take over
Signed-off-by: bingshen.wbs <[email protected]>
1 parent ffb3cd1 commit ab0fece

File tree

3 files changed

+691
-34
lines changed

3 files changed

+691
-34
lines changed

go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ require (
99
github.com/onsi/ginkgo/v2 v2.19.0
1010
github.com/onsi/gomega v1.33.1
1111
github.com/samber/lo v1.47.0
12+
github.com/stretchr/testify v1.9.0
1213
github.com/vishvananda/netlink v1.3.0
1314
golang.org/x/net v0.26.0
1415
google.golang.org/grpc v1.65.0
@@ -23,6 +24,7 @@ require (
2324

2425
require (
2526
github.com/morikuni/aec v1.0.0 // indirect
27+
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
2628
gotest.tools/v3 v3.5.1 // indirect
2729
)
2830

internal/controller/eri.go

Lines changed: 86 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package controller
22

33
import (
44
"fmt"
5+
"os"
56
"strings"
67

78
networkv1 "github.com/AliyunContainerService/alibabacloud-erdma-controller/api/v1"
@@ -32,16 +33,20 @@ const (
3233
type EriClient struct {
3334
client *ecs.Client
3435
regionID string
35-
managedNonOwned bool
36+
ManagedNonOwned bool
3637
}
3738

3839
func NewEriClient(k8sClient client.Client) (*EriClient, error) {
3940
cred, err := getCredential(k8sClient)
4041
if err != nil {
4142
return nil, err
4243
}
44+
network := "tcp"
45+
if os.Getenv("PUBLIC_NETWORK") == "true" {
46+
network = "public"
47+
}
4348

44-
ecsEndpoint, err := service.GetEndpointRules(tea.String("ecs"), tea.String(config.GetConfig().Region), tea.String("regional"), tea.String("vpc"), nil)
49+
ecsEndpoint, err := service.GetEndpointRules(tea.String("ecs"), tea.String(config.GetConfig().Region), tea.String("regional"), tea.String(network), nil)
4550
if err != nil {
4651
return nil, err
4752
}
@@ -50,15 +55,15 @@ func NewEriClient(k8sClient client.Client) (*EriClient, error) {
5055
UserAgent: ptr.To("AlibabaCloud/ERdma-Controller/0.1"),
5156
Credential: cred,
5257
EndpointType: tea.String("regional"),
53-
Network: tea.String("vpc"),
58+
Network: tea.String(network),
5459
Endpoint: ecsEndpoint,
5560
})
5661
if err != nil {
5762
return nil, err
5863
}
5964
return &EriClient{
6065
regionID: config.GetConfig().Region,
61-
managedNonOwned: config.GetConfig().ManageNonOwnedERIs,
66+
ManagedNonOwned: config.GetConfig().ManageNonOwnedERIs,
6267
client: client,
6368
}, nil
6469
}
@@ -173,7 +178,7 @@ func (e *EriClient) ConvertPrimaryENI(primaryENI string, queuePair int) error {
173178
NetworkInterfaceTrafficConfig: &ecs.ModifyNetworkInterfaceAttributeRequestNetworkInterfaceTrafficConfig{
174179
NetworkInterfaceTrafficMode: ptr.To(trafficModeRDMA),
175180
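// TODO(review): this note said that dynamically setting the queue pair number was not supported; QueuePairNumber is now set below, so confirm the API accepts it and remove this note.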
176-
// QueuePairNumber: ptr.To(int32(queuePair)),
181+
QueuePairNumber: ptr.To(int32(queuePair)),
177182
},
178183
}); err != nil {
179184
return err
@@ -218,55 +223,102 @@ func (e *EriClient) SelectERIs(instanceID string) ([]*types.ERI, error) {
218223
cardCount = int(min(*instanceType.NetworkCardQuantity, eriQuantity))
219224
}
220225
queuePairCount = int(*instanceType.QueuePairNumber)
226+
// For GPU instances, the maximum queue pair count is the per-card queue pair number multiplied by the card count.
227+
if instanceType.GPUAmount != nil && *instanceType.GPUAmount > 0 {
228+
queuePairCount = int(*instanceType.QueuePairNumber * int32(cardCount))
229+
}
221230
}
222231

223-
existENIs, err := e.client.DescribeNetworkInterfaces(&ecs.DescribeNetworkInterfacesRequest{
232+
describeENIResponse, err := e.client.DescribeNetworkInterfaces(&ecs.DescribeNetworkInterfacesRequest{
224233
RegionId: ptr.To(e.regionID),
225234
InstanceId: ptr.To(instanceID),
226235
PageSize: ptr.To(int32(100)),
227236
})
228237
if err != nil {
229238
return nil, fmt.Errorf("cannot found node eni: %v", err)
230239
}
240+
existENIs := describeENIResponse.Body.NetworkInterfaceSets.NetworkInterfaceSet
241+
selectEriList, needCreate, queuePairNumberConfig, err := e.SelectEriFromExist(existENIs, queuePairCount, cardCount)
242+
if err != nil {
243+
return nil, fmt.Errorf("cannot generate eri config list from exist enis: %v", err)
244+
}
245+
eris, err := e.CreateEriForInstance(instanceResp.Body.Instances.Instance[0], needCreate, queuePairNumberConfig)
246+
if err != nil {
247+
return nil, err
248+
}
249+
selectEriList = append(selectEriList, eris...)
250+
251+
return selectEriList, nil
252+
}
253+
254+
func (e *EriClient) SelectEriFromExist(existENIs []*ecs.DescribeNetworkInterfacesResponseBodyNetworkInterfaceSetsNetworkInterfaceSet, queuePairCount, cardCount int) ([]*types.ERI, []int, int, error) {
255+
var existQueuePairCount int
256+
existERIs := lo.Filter(existENIs, func(item *ecs.DescribeNetworkInterfacesResponseBodyNetworkInterfaceSetsNetworkInterfaceSet, _ int) bool {
257+
eri := item.NetworkInterfaceTrafficMode != nil && *item.NetworkInterfaceTrafficMode == trafficModeRDMA
258+
if eri {
259+
existQueuePairCount += int(*item.QueuePairNumber)
260+
}
261+
return eri
262+
})
263+
eriLog.Info("exist eri", "existERIs", lo.Map(existERIs, func(item *ecs.DescribeNetworkInterfacesResponseBodyNetworkInterfaceSetsNetworkInterfaceSet, _ int) *types.ERI {
264+
return toEri(item, 0)
265+
}), "existQueuePairCount", existQueuePairCount, "osMaxQueuePairCount", queuePairCount, "cardCount", cardCount)
266+
231267
var (
232268
selectedENIs []*ecs.DescribeNetworkInterfacesResponseBodyNetworkInterfaceSetsNetworkInterfaceSet
233269
cardIndexENI = map[int]*types.ERI{}
234270
)
235-
for _, eni := range existENIs.Body.NetworkInterfaceSets.NetworkInterfaceSet {
236-
if eni.Type != nil && *eni.Type == "Primary" {
237-
selectedENIs = append(selectedENIs, eni)
238-
cardIndexENI[0] = toEri(eni, queuePairCount/cardCount)
239-
} else {
240-
if eni.NetworkInterfaceTrafficMode != nil && *eni.NetworkInterfaceTrafficMode == trafficModeRDMA && e.OwnENI(eni) {
241-
eniIndex := eniCardIndex(eni)
242-
if _, ok := cardIndexENI[eniIndex]; !ok {
243-
cardIndexENI[eniIndex] = toEri(eni, queuePairCount/cardCount)
244-
selectedENIs = append(selectedENIs, eni)
245-
}
271+
272+
for _, eri := range existERIs {
273+
if !e.OwnENI(eri) {
274+
continue
275+
}
276+
eniIndex := eniCardIndex(eri)
277+
if _, ok := cardIndexENI[eniIndex]; !ok {
278+
cardIndexENI[eniIndex] = toEri(eri, 0)
279+
selectedENIs = append(selectedENIs, eri)
280+
}
281+
}
282+
var needCreateOrConvert []int
283+
if existQueuePairCount <= queuePairCount {
284+
for i := 0; i < cardCount; i++ {
285+
if _, ok := cardIndexENI[i]; !ok {
286+
needCreateOrConvert = append(needCreateOrConvert, i)
246287
}
247288
}
248289
}
249-
if len(cardIndexENI) == 0 {
250-
return nil, fmt.Errorf("cannot found node primary eni")
290+
291+
var remainQueuePairCountPerCardIndex int
292+
if len(needCreateOrConvert) > 0 {
293+
remainQueuePairCountPerCardIndex = (queuePairCount - existQueuePairCount) / len(needCreateOrConvert)
294+
if remainQueuePairCountPerCardIndex > 0 {
295+
if _, ok := cardIndexENI[0]; !ok {
296+
// If no ENI is bound to card index 0, use the primary ENI as the ENI for card index 0.
297+
for _, eni := range existENIs {
298+
if eni.Type != nil && *eni.Type == "Primary" {
299+
selectedENIs = append(selectedENIs, eni)
300+
cardIndexENI[0] = toEri(eni, remainQueuePairCountPerCardIndex)
301+
cardIndex0Idx := lo.IndexOf(needCreateOrConvert, 0)
302+
// Remove card index 0 from the create list, because the primary ENI now fills that slot.
303+
needCreateOrConvert = append(needCreateOrConvert[:cardIndex0Idx], needCreateOrConvert[cardIndex0Idx+1:]...)
304+
}
305+
}
306+
}
307+
if len(cardIndexENI) == 0 {
308+
return nil, nil, 0, fmt.Errorf("cannot find node primary ENI or existing ENI")
309+
}
310+
} else {
311+
needCreateOrConvert = nil
312+
}
251313
}
252314

253315
eriList := lo.Map(selectedENIs, func(item *ecs.DescribeNetworkInterfacesResponseBodyNetworkInterfaceSetsNetworkInterfaceSet, _ int) *types.ERI {
254-
return toEri(item, queuePairCount/cardCount)
316+
return toEri(item, remainQueuePairCountPerCardIndex)
255317
})
256-
257-
var needCreate []int
258-
for i := 0; i < cardCount; i++ {
259-
if _, ok := cardIndexENI[i]; !ok {
260-
needCreate = append(needCreate, i)
261-
}
318+
if len(eriList) == 0 && len(needCreateOrConvert) == 0 {
319+
return nil, nil, 0, fmt.Errorf("cannot create ERI for instance due to no available slot")
262320
}
263-
eris, err := e.CreateEriForInstance(instanceResp.Body.Instances.Instance[0], needCreate, queuePairCount/cardCount)
264-
if err != nil {
265-
return nil, err
266-
}
267-
eriList = append(eriList, eris...)
268-
269-
return eriList, nil
321+
return eriList, needCreateOrConvert, remainQueuePairCountPerCardIndex, nil
270322
}
271323

272324
func (e *EriClient) EnsureEriForInstance(devices []networkv1.DeviceInfo) ([]networkv1.DeviceStatus, error) {
@@ -346,7 +398,7 @@ func (e *EriClient) EnsureEriForInstance(devices []networkv1.DeviceInfo) ([]netw
346398
}
347399

348400
func (e *EriClient) OwnENI(eni *ecs.DescribeNetworkInterfacesResponseBodyNetworkInterfaceSetsNetworkInterfaceSet) bool {
349-
if e.managedNonOwned {
401+
if e.ManagedNonOwned {
350402
return true
351403
}
352404
if eni.Tags == nil || eni.Tags.Tag == nil {

0 commit comments

Comments
 (0)