
Commit 559aff2

fix: optimize default placement and scoring, improve k8s version compatibility, fix missed karpenter label/annotation issue (#384)
* fix: lint issue
* fix: add node compact GPU low load first mode, refactor test scoring
* fix: default placement mode issue
* fix: use ld.so.conf.d for ld lib path for client/ngpu, add apparmor config for hypervisor
* fix: karpenter label missing issue
1 parent bf4a326 commit 559aff2

16 files changed: +703 -662 lines changed

.vscode/settings.json

Lines changed: 1 addition & 0 deletions
@@ -110,6 +110,7 @@
     "nodeclassref",
     "noderesources",
     "nolint",
+    "NUMA",
     "Nvlink",
     "NVML",
     "objs",

api/v1/schedulingconfigtemplate_types.go

Lines changed: 5 additions & 2 deletions
@@ -42,7 +42,7 @@ type SchedulingConfigTemplateSpec struct {
 }
 
 type PlacementConfig struct {
-    // +kubebuilder:default=CompactFirst
+    // +kubebuilder:default=NodeCompactGPULowLoad
     Mode PlacementMode `json:"mode"`
 
     // +kubebuilder:default=true
@@ -53,7 +53,7 @@ type PlacementConfig struct {
     GPUFilters []GPUFilter `json:"gpuFilters,omitempty"`
 }
 
-// +kubebuilder:validation:Enum=CompactFirst;LowLoadFirst
+// +kubebuilder:validation:Enum=CompactFirst;LowLoadFirst;NodeCompactGPULowLoad
 type PlacementMode string
 
 const (
@@ -62,6 +62,9 @@ const (
 
     // in some cases, use lowLoadFirst for balance and fairness
     PlacementModeLowLoadFirst PlacementMode = "LowLoadFirst"
+
+    // in some cases, use nodeCompactGPULowLoad for balance and fairness
+    PlacementModeNodeCompactGPULowLoad PlacementMode = "NodeCompactGPULowLoad"
 )
 
 // GPUFilter is to select eligible GPUs for scheduling.
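For context, a minimal sketch (not part of this commit) of selecting the new mode explicitly from Go; the tfv1 import path is an assumption based on the repository layout. Omitting mode in a SchedulingConfigTemplate manifest now defaults to NodeCompactGPULowLoad instead of CompactFirst.

package example

import tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" // assumed module path

// newPlacement pins the new default placement mode explicitly.
func newPlacement() tfv1.PlacementConfig {
    return tfv1.PlacementConfig{
        Mode: tfv1.PlacementModeNodeCompactGPULowLoad,
    }
}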

charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml

Lines changed: 2 additions & 1 deletion
@@ -277,10 +277,11 @@ spec:
           type: object
         type: array
       mode:
-        default: CompactFirst
+        default: NodeCompactGPULowLoad
         enum:
         - CompactFirst
         - LowLoadFirst
+        - NodeCompactGPULowLoad
         type: string
     required:
     - mode

cmd/main.go

Lines changed: 1 addition & 1 deletion
@@ -109,7 +109,7 @@ func init() {
     }
     karpenterScheme.Register(&karpv1.NodeClaim{}, &karpv1.NodeClaimList{})
     karpenterScheme.Register(&karpv1.NodePool{}, &karpv1.NodePoolList{})
-    karpenterScheme.AddToScheme(scheme)
+    utilruntime.Must(karpenterScheme.AddToScheme(scheme))
 }
 
 //nolint:gocyclo
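The one-line change above stops discarding the error from AddToScheme: utilruntime.Must panics on a non-nil error, so a failed Karpenter scheme registration is surfaced at startup instead of being silently ignored. A generic sketch of the same pattern using standard k8s.io packages (not this repo's main.go):

package main

import (
    "k8s.io/apimachinery/pkg/runtime"
    utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    clientgoscheme "k8s.io/client-go/kubernetes/scheme"
)

var scheme = runtime.NewScheme()

func init() {
    // Must panics if AddToScheme returns an error, so broken type
    // registrations stop the process right away.
    utilruntime.Must(clientgoscheme.AddToScheme(scheme))
}

func main() {}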

config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml

Lines changed: 2 additions & 1 deletion
@@ -277,10 +277,11 @@ spec:
           type: object
         type: array
       mode:
-        default: CompactFirst
+        default: NodeCompactGPULowLoad
         enum:
         - CompactFirst
         - LowLoadFirst
+        - NodeCompactGPULowLoad
         type: string
     required:
     - mode

internal/constants/env.go

Lines changed: 6 additions & 2 deletions
@@ -67,12 +67,16 @@ const (
     RealNvmlLibPathValue = "/lib/x86_64-linux-gnu/libnvidia-ml.so.1"
     RealCUDALibPathValue = "/lib/x86_64-linux-gnu/libcuda.so"
 
-    PrependPathEnv          = "TF_PREPEND_PATH"
-    PrependLDLibraryPathEnv = "TF_PREPEND_LD_LIBRARY_PATH"
+    PrependPathEnv = "TF_PREPEND_PATH"
+
+    RunInsideGPUEnv = "RUN_INSIDE_GPU_NODE"
 
     LdPreloadFileName = "ld.so.preload"
     LdPreloadFile = "/etc/ld.so.preload"
 
+    LdLibraryPathFileName = "zz_tensor-fusion.conf"
+    LdLibraryPathFile = "/etc/ld.so.conf.d/zz_tensor-fusion.conf"
+
     TFLibsVolumeName = "tf-libs"
     TFLibsVolumeMountPath = "/tensor-fusion"
     TFConnectionNamePrefix = "-tf-vgpu-"
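The new constants switch the client/ngpu library path from LD_LIBRARY_PATH prepending to an /etc/ld.so.conf.d drop-in. A hypothetical sketch of producing that drop-in and refreshing the loader cache; the helper, the ldconfig call, and the example directory are illustrations, not code from this commit:

package main

import (
    "os"
    "os/exec"
)

// writeLdConf writes the tensor-fusion library directory into the drop-in
// named by LdLibraryPathFile so the dynamic loader finds the libraries
// without any process-level LD_LIBRARY_PATH changes.
func writeLdConf(libDir string) error {
    if err := os.WriteFile("/etc/ld.so.conf.d/zz_tensor-fusion.conf", []byte(libDir+"\n"), 0o644); err != nil {
        return err
    }
    // Rebuild the loader cache so the new search path takes effect.
    return exec.Command("ldconfig").Run()
}

func main() {
    _ = writeLdConf("/tensor-fusion") // example dir matching TFLibsVolumeMountPath
}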

internal/gpuallocator/filter/node_filter.go

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ func NewSameNodeFilter(count uint) *SameNodeFilter {
 
 // Filter implements GPUFilter.Filter
 // It groups GPUs by node and returns only those nodes that have at least 'count' GPUs
+// Must run as the last step, otherwise some of the nodes and GPUs it passes may not be valid in the following steps
 func (f *SameNodeFilter) Filter(ctx context.Context, workerPodKey tfv1.NameNamespace, gpus []*tfv1.GPU) ([]*tfv1.GPU, error) {
     // If count is 1 or 0, no need to filter by node
     if f.count <= 1 {
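The comment added above requires the same-node filter to run last because it reasons over whatever GPUs the earlier filters let through. A simplified sketch of that grouping, not the repo's implementation; nodeNameOf is a hypothetical helper standing in for however a tfv1.GPU records its node:

// filterSameNode keeps only GPUs on nodes that can still supply at least
// `count` of the GPUs that survived every earlier filter; running it sooner
// could keep nodes whose GPUs are later filtered out.
func filterSameNode(gpus []*tfv1.GPU, count uint, nodeNameOf func(*tfv1.GPU) string) []*tfv1.GPU {
    if count <= 1 {
        return gpus
    }
    byNode := make(map[string][]*tfv1.GPU)
    for _, gpu := range gpus {
        node := nodeNameOf(gpu)
        byNode[node] = append(byNode[node], gpu)
    }
    out := make([]*tfv1.GPU, 0, len(gpus))
    for _, group := range byNode {
        if uint(len(group)) >= count {
            out = append(out, group...)
        }
    }
    return out
}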

internal/gpuallocator/gpuallocator.go

Lines changed: 24 additions & 17 deletions
@@ -44,7 +44,9 @@ var mu sync.Mutex
 var GPUCapacityMap = map[string]tfv1.Resource{}
 
 type Strategy interface {
-    Score(gpu *tfv1.GPU) int
+    // When isForNode = true, the result is the GPU's node-level score;
+    // otherwise it is the score of a single GPU inside one node
+    Score(gpu *tfv1.GPU, isForNode bool) int
 
     SelectGPUs(gpus []*tfv1.GPU, count uint) ([]*tfv1.GPU, error)
 }
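As a hypothetical illustration of the two-level contract described by the new comment (only Score is sketched, and loadPercent is a made-up metric, not the repo's actual strategy logic): node-level calls favor packing work onto busier nodes, GPU-level calls favor the least-loaded GPU inside the chosen node.

// sketchStrategy is a made-up example honoring the new isForNode flag.
type sketchStrategy struct {
    loadPercent func(*tfv1.GPU) int // 0 = idle, 100 = fully loaded (assumed metric)
}

func (s sketchStrategy) Score(gpu *tfv1.GPU, isForNode bool) int {
    if isForNode {
        // Node level: busier GPUs raise their node's score, packing workers onto fewer nodes.
        return s.loadPercent(gpu)
    }
    // GPU level: within the chosen node, prefer the least-loaded GPU.
    return 100 - s.loadPercent(gpu)
}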
@@ -59,13 +61,14 @@ func (p *SimulateSchedulingFilterDetail) Clone() fwk.StateData {
 }
 
 // NewStrategy creates a strategy based on the placement mode
-func NewStrategy(placementMode tfv1.PlacementMode, cfg *config.GPUFitConfig) Strategy {
+func NewStrategy(placementMode tfv1.PlacementMode, cfg *config.GPUFitConfig, nodeGpuStore map[string]map[string]*tfv1.GPU) Strategy {
     switch placementMode {
     case tfv1.PlacementModeLowLoadFirst:
-        return LowLoadFirst{cfg: cfg}
+        return LowLoadFirst{cfg: cfg, nodeGpuStore: nodeGpuStore}
+    case tfv1.PlacementModeCompactFirst:
+        return CompactFirst{cfg: cfg, nodeGpuStore: nodeGpuStore}
     default:
-        // CompactFirst is the default strategy
-        return CompactFirst{cfg: cfg}
+        return NodeCompactGPULowLoad{cfg: cfg, nodeGpuStore: nodeGpuStore}
     }
 }
 
@@ -182,14 +185,16 @@ func (s *GpuAllocator) Filter(
         filterRegistry = filterRegistry.With(filter.NewGPUModelFilter(req.GPUModel))
     }
 
-    if req.Count > 1 {
-        filterRegistry = filterRegistry.With(filter.NewSameNodeFilter(req.Count))
-    }
-    // Add NodeAffinityFilter if specified
+    // NOTE: deprecated, use the Kubernetes native spec template affinity instead
     if req.NodeAffinity != nil {
         filterRegistry = filterRegistry.With(filter.NewNodeAffinityFilter(s.Client, req.NodeAffinity))
     }
 
+    // The same-node filter must be applied as the final step
+    if req.Count > 1 {
+        filterRegistry = filterRegistry.With(filter.NewSameNodeFilter(req.Count))
+    }
+
     // Apply the filters in sequence
     filteredGPUs, filterDetails, err := filterRegistry.Apply(s.ctx, req.WorkloadNameNamespace, toFilterGPUs, isSimulateSchedule)
     if err != nil {
@@ -245,7 +250,7 @@ func (s *GpuAllocator) Select(req *tfv1.AllocRequest, filteredGPUs []*tfv1.GPU)
 
     strategy := NewStrategy(schedulingConfigTemplate.Spec.Placement.Mode, &config.GPUFitConfig{
         MaxWorkerPerNode: s.maxWorkerPerNode,
-    })
+    }, s.nodeGpuStore)
     selectedGPUs, err := strategy.SelectGPUs(filteredGPUs, req.Count)
     if err != nil {
         return nil, fmt.Errorf("select GPU: %w", err)
@@ -670,18 +675,20 @@ type scoredGPU struct {
     score int
 }
 
+func (s *GpuAllocator) GetScoringStrategy(cfg *config.GPUFitConfig, req *tfv1.AllocRequest) Strategy {
+    return NewStrategy(s.getPlacementMode(s.ctx, req.PoolName), cfg, s.nodeGpuStore)
+}
+
 // First level is k8s node name, second level is GPU name, value is score
 func (s *GpuAllocator) Score(
-    ctx context.Context, cfg *config.GPUFitConfig, req *tfv1.AllocRequest, nodeGPUs map[string][]*tfv1.GPU,
+    ctx context.Context, strategy Strategy, req *tfv1.AllocRequest, nodeGPUs map[string][]*tfv1.GPU,
 ) map[string]map[string]int {
     result := make(map[string]map[string]int, len(nodeGPUs))
-    strategy := NewStrategy(s.getPlacementMode(ctx, req.PoolName), cfg)
-
     allScores := make([]scoredGPU, 0, len(nodeGPUs))
 
     for nodeName, gpus := range nodeGPUs {
         for _, gpu := range gpus {
-            res := strategy.Score(gpu)
+            res := strategy.Score(gpu, true)
 
             // making Pending GPU to lower score, prefer not scheduling to them
             if gpu.Status.Phase == tfv1.TensorFusionGPUPhasePending {
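With the refactor above, callers construct the scoring strategy once via GetScoringStrategy and pass it into Score. A minimal usage sketch against the new signatures, imports omitted; allocator, ctx, req, nodeGPUs, and maxWorkers are assumed to come from the surrounding scheduler code:

// Build the pool's strategy once, then score every candidate GPU per node.
strategy := allocator.GetScoringStrategy(&config.GPUFitConfig{
    MaxWorkerPerNode: maxWorkers,
}, req)
scores := allocator.Score(ctx, strategy, req, nodeGPUs)

// First level is the k8s node name, second level is the GPU name.
for nodeName, gpuScores := range scores {
    for gpuName, score := range gpuScores {
        log.Printf("node=%s gpu=%s score=%d", nodeName, gpuName, score)
    }
}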
@@ -1477,18 +1484,18 @@ func (s *GpuAllocator) getPlacementMode(ctx context.Context, poolName string) tf
     pool := &tfv1.GPUPool{}
     if err := s.Get(ctx, client.ObjectKey{Name: poolName}, pool); err != nil {
         // if failed to get pool, default to compact first
-        return tfv1.PlacementModeCompactFirst
+        return tfv1.PlacementModeNodeCompactGPULowLoad
     }
 
     if pool.Spec.SchedulingConfigTemplate == nil || *pool.Spec.SchedulingConfigTemplate == "" {
-        return tfv1.PlacementModeCompactFirst
+        return tfv1.PlacementModeNodeCompactGPULowLoad
     }
 
     // get scheduling config template
     schedulingConfigTemplate := &tfv1.SchedulingConfigTemplate{}
     if err := s.Get(ctx, client.ObjectKey{Name: *pool.Spec.SchedulingConfigTemplate}, schedulingConfigTemplate); err != nil {
         // if failed to get scheduling config template, default to compact first
-        return tfv1.PlacementModeCompactFirst
+        return tfv1.PlacementModeNodeCompactGPULowLoad
     }
     return schedulingConfigTemplate.Spec.Placement.Mode
 }
