@@ -44,7 +44,9 @@ var mu sync.Mutex
 var GPUCapacityMap = map[string]tfv1.Resource{}
 
 type Strategy interface {
-	Score(gpu *tfv1.GPU) int
+	// When isForNode = true, Score returns the GPU's node-level score;
+	// otherwise it returns the score of a single GPU inside one node.
+	Score(gpu *tfv1.GPU, isForNode bool) int
 
 	SelectGPUs(gpus []*tfv1.GPU, count uint) ([]*tfv1.GPU, error)
 }
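
The interface change above is the core of this diff: strategies now score in two modes. Below is a minimal sketch of what that contract implies for an implementer; the exampleStrategy type, its fields, and the placeholder scores are hypothetical and assume the package's existing config, tfv1, and fmt imports — only the interface shape comes from this change.

// exampleStrategy is a hypothetical Strategy used only to illustrate the
// isForNode flag; the real strategies (LowLoadFirst, CompactFirst,
// NodeCompactGPULowLoad) carry a cfg plus the shared nodeGpuStore.
type exampleStrategy struct {
	cfg          *config.GPUFitConfig
	nodeGpuStore map[string]map[string]*tfv1.GPU
}

func (e exampleStrategy) Score(gpu *tfv1.GPU, isForNode bool) int {
	if isForNode {
		// node-level ranking: score the node this GPU belongs to,
		// typically by aggregating over the node's entry in nodeGpuStore
		return 100 // placeholder node score
	}
	// GPU-level ranking: score this GPU relative to its siblings on the node
	return 50 // placeholder GPU score
}

func (e exampleStrategy) SelectGPUs(gpus []*tfv1.GPU, count uint) ([]*tfv1.GPU, error) {
	if uint(len(gpus)) < count {
		return nil, fmt.Errorf("need %d GPUs, only %d candidates", count, len(gpus))
	}
	return gpus[:count], nil // the sketch simply takes the first count candidates
}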
@@ -59,13 +61,14 @@ func (p *SimulateSchedulingFilterDetail) Clone() fwk.StateData {
 }
 
 // NewStrategy creates a strategy based on the placement mode
-func NewStrategy(placementMode tfv1.PlacementMode, cfg *config.GPUFitConfig) Strategy {
+func NewStrategy(placementMode tfv1.PlacementMode, cfg *config.GPUFitConfig, nodeGpuStore map[string]map[string]*tfv1.GPU) Strategy {
 	switch placementMode {
 	case tfv1.PlacementModeLowLoadFirst:
-		return LowLoadFirst{cfg: cfg}
+		return LowLoadFirst{cfg: cfg, nodeGpuStore: nodeGpuStore}
+	case tfv1.PlacementModeCompactFirst:
+		return CompactFirst{cfg: cfg, nodeGpuStore: nodeGpuStore}
 	default:
-		// CompactFirst is the default strategy
-		return CompactFirst{cfg: cfg}
+		return NodeCompactGPULowLoad{cfg: cfg, nodeGpuStore: nodeGpuStore}
 	}
 }
 
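
With the extra parameter, callers hand NewStrategy the allocator's nodeGpuStore (keyed by node name, then GPU name), and an unrecognized placement mode now falls back to NodeCompactGPULowLoad rather than CompactFirst. A hedged usage sketch, assumed to run inside a GpuAllocator method so that s.maxWorkerPerNode and s.nodeGpuStore are in scope:

cfg := &config.GPUFitConfig{MaxWorkerPerNode: s.maxWorkerPerNode}

// An explicit mode still maps to its named strategy...
lowLoad := NewStrategy(tfv1.PlacementModeLowLoadFirst, cfg, s.nodeGpuStore)

// ...while an empty or unknown mode takes the default branch
// and yields NodeCompactGPULowLoad.
fallback := NewStrategy("", cfg, s.nodeGpuStore)
_, _ = lowLoad, fallback // silence unused-variable errors in this sketch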
@@ -182,14 +185,16 @@ func (s *GpuAllocator) Filter(
 		filterRegistry = filterRegistry.With(filter.NewGPUModelFilter(req.GPUModel))
 	}
 
-	if req.Count > 1 {
-		filterRegistry = filterRegistry.With(filter.NewSameNodeFilter(req.Count))
-	}
-	// Add NodeAffinityFilter if specified
+	// NOTE: deprecated, use the Kubernetes-native spec template affinity instead
 	if req.NodeAffinity != nil {
 		filterRegistry = filterRegistry.With(filter.NewNodeAffinityFilter(s.Client, req.NodeAffinity))
 	}
 
+	// The same-node filter must be applied as the final step
+	if req.Count > 1 {
+		filterRegistry = filterRegistry.With(filter.NewSameNodeFilter(req.Count))
+	}
+
 	// Apply the filters in sequence
 	filteredGPUs, filterDetails, err := filterRegistry.Apply(s.ctx, req.WorkloadNameNamespace, toFilterGPUs, isSimulateSchedule)
 	if err != nil {
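
The reordering matters because the filters run in sequence: the same-node filter has to count GPUs that survive every other filter, otherwise a node could be kept on the strength of GPUs that the node-affinity filter would later remove. A sketch of the effective chain for a multi-GPU request with affinity set, using only constructors that appear above and assuming With returns the registry (as the reassignment pattern suggests); error handling elided:

registry := filterRegistry.
	With(filter.NewGPUModelFilter(req.GPUModel)).
	With(filter.NewNodeAffinityFilter(s.Client, req.NodeAffinity)).
	With(filter.NewSameNodeFilter(req.Count)) // last: counts only surviving GPUs per node
filteredGPUs, filterDetails, err := registry.Apply(s.ctx, req.WorkloadNameNamespace, toFilterGPUs, isSimulateSchedule)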
@@ -245,7 +250,7 @@ func (s *GpuAllocator) Select(req *tfv1.AllocRequest, filteredGPUs []*tfv1.GPU)
 
 	strategy := NewStrategy(schedulingConfigTemplate.Spec.Placement.Mode, &config.GPUFitConfig{
 		MaxWorkerPerNode: s.maxWorkerPerNode,
-	})
+	}, s.nodeGpuStore)
 	selectedGPUs, err := strategy.SelectGPUs(filteredGPUs, req.Count)
 	if err != nil {
 		return nil, fmt.Errorf("select GPU: %w", err)
@@ -670,18 +675,20 @@ type scoredGPU struct {
 	score int
 }
 
+func (s *GpuAllocator) GetScoringStrategy(cfg *config.GPUFitConfig, req *tfv1.AllocRequest) Strategy {
+	return NewStrategy(s.getPlacementMode(s.ctx, req.PoolName), cfg, s.nodeGpuStore)
+}
+
 // First level is k8s node name, second level is GPU name, value is score
 func (s *GpuAllocator) Score(
-	ctx context.Context, cfg *config.GPUFitConfig, req *tfv1.AllocRequest, nodeGPUs map[string][]*tfv1.GPU,
+	ctx context.Context, strategy Strategy, req *tfv1.AllocRequest, nodeGPUs map[string][]*tfv1.GPU,
 ) map[string]map[string]int {
 	result := make(map[string]map[string]int, len(nodeGPUs))
-	strategy := NewStrategy(s.getPlacementMode(ctx, req.PoolName), cfg)
-
 	allScores := make([]scoredGPU, 0, len(nodeGPUs))
 
 	for nodeName, gpus := range nodeGPUs {
 		for _, gpu := range gpus {
-			res := strategy.Score(gpu)
+			res := strategy.Score(gpu, true)
 
 			// making Pending GPU to lower score, prefer not scheduling to them
 			if gpu.Status.Phase == tfv1.TensorFusionGPUPhasePending {
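
Score no longer builds its own strategy; the caller constructs one (typically via the new GetScoringStrategy, which reuses the allocator's context, pool placement mode, and nodeGpuStore) and passes it in, so the same Strategy instance can serve node-level scoring here and GPU-level scoring elsewhere. A hedged calling sketch, assumed to run inside a GpuAllocator method with ctx, req, and nodeGPUs already prepared:

cfg := &config.GPUFitConfig{MaxWorkerPerNode: s.maxWorkerPerNode}
strategy := s.GetScoringStrategy(cfg, req)

// Result shape per the comment above: node name -> GPU name -> score.
scores := s.Score(ctx, strategy, req, nodeGPUs)
for nodeName, gpuScores := range scores {
	for gpuName, score := range gpuScores {
		fmt.Printf("node %s / gpu %s scored %d\n", nodeName, gpuName, score)
	}
}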
@@ -1477,18 +1484,18 @@ func (s *GpuAllocator) getPlacementMode(ctx context.Context, poolName string) tf
 	pool := &tfv1.GPUPool{}
 	if err := s.Get(ctx, client.ObjectKey{Name: poolName}, pool); err != nil {
 		// if failed to get pool, default to compact first
-		return tfv1.PlacementModeCompactFirst
+		return tfv1.PlacementModeNodeCompactGPULowLoad
 	}
 
 	if pool.Spec.SchedulingConfigTemplate == nil || *pool.Spec.SchedulingConfigTemplate == "" {
-		return tfv1.PlacementModeCompactFirst
+		return tfv1.PlacementModeNodeCompactGPULowLoad
 	}
 
 	// get scheduling config template
 	schedulingConfigTemplate := &tfv1.SchedulingConfigTemplate{}
 	if err := s.Get(ctx, client.ObjectKey{Name: *pool.Spec.SchedulingConfigTemplate}, schedulingConfigTemplate); err != nil {
 		// if failed to get scheduling config template, default to compact first
-		return tfv1.PlacementModeCompactFirst
+		return tfv1.PlacementModeNodeCompactGPULowLoad
 	}
 	return schedulingConfigTemplate.Spec.Placement.Mode
 }