
Commit 559aff2

fix: optimize default placement and scoring, improve k8s version compatibility, fix missed karpenter label/annotation issue (#384)
* fix: lint issue
* fix: add node compact GPU low load first mode, refactor test scoring
* fix: default placement mode issue
* fix: use ld.so.conf.d for ld lib path for client/ngpu, add apparmor config for hypervisor
* fix: karpenter label missing issue
1 parent bf4a326 commit 559aff2

16 files changed: +703 -662 lines changed

.vscode/settings.json

Lines changed: 1 addition & 0 deletions
@@ -110,6 +110,7 @@
     "nodeclassref",
     "noderesources",
     "nolint",
+    "NUMA",
     "Nvlink",
     "NVML",
     "objs",

api/v1/schedulingconfigtemplate_types.go

Lines changed: 5 additions & 2 deletions
@@ -42,7 +42,7 @@ type SchedulingConfigTemplateSpec struct {
 }
 
 type PlacementConfig struct {
-    // +kubebuilder:default=CompactFirst
+    // +kubebuilder:default=NodeCompactGPULowLoad
     Mode PlacementMode `json:"mode"`
 
     // +kubebuilder:default=true
@@ -53,7 +53,7 @@ type PlacementConfig struct {
     GPUFilters []GPUFilter `json:"gpuFilters,omitempty"`
 }
 
-// +kubebuilder:validation:Enum=CompactFirst;LowLoadFirst
+// +kubebuilder:validation:Enum=CompactFirst;LowLoadFirst;NodeCompactGPULowLoad
 type PlacementMode string
 
 const (
@@ -62,6 +62,9 @@ const (
 
     // in some cases, use lowLoadFirst for balance and fairness
     PlacementModeLowLoadFirst PlacementMode = "LowLoadFirst"
+
+    // in some cases, use nodeCompactGPULowLoad for balance and fairness
+    PlacementModeNodeCompactGPULowLoad PlacementMode = "NodeCompactGPULowLoad"
 )
 
 // GPUFilter is to select eligible GPUs for scheduling.
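For context, a minimal sketch (not part of this commit) of selecting the new mode explicitly from Go; the tfv1 import path is an assumption based on the repository layout. Omitting mode in a SchedulingConfigTemplate manifest now defaults to NodeCompactGPULowLoad instead of CompactFirst.

package example

import tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" // assumed module path

// newPlacement pins the new default placement mode explicitly.
func newPlacement() tfv1.PlacementConfig {
    return tfv1.PlacementConfig{
        Mode: tfv1.PlacementModeNodeCompactGPULowLoad,
    }
}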

charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml

Lines changed: 2 additions & 1 deletion
@@ -277,10 +277,11 @@ spec:
           type: object
         type: array
       mode:
-        default: CompactFirst
+        default: NodeCompactGPULowLoad
         enum:
         - CompactFirst
         - LowLoadFirst
+        - NodeCompactGPULowLoad
         type: string
     required:
     - mode

cmd/main.go

Lines changed: 1 addition & 1 deletion
@@ -109,7 +109,7 @@ func init() {
     }
     karpenterScheme.Register(&karpv1.NodeClaim{}, &karpv1.NodeClaimList{})
     karpenterScheme.Register(&karpv1.NodePool{}, &karpv1.NodePoolList{})
-    karpenterScheme.AddToScheme(scheme)
+    utilruntime.Must(karpenterScheme.AddToScheme(scheme))
 }
 
 //nolint:gocyclo
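The one-line change above stops discarding the error from AddToScheme: utilruntime.Must panics on a non-nil error, so a failed Karpenter scheme registration is surfaced at startup instead of being silently ignored. A generic sketch of the same pattern using standard k8s.io packages (not this repo's main.go):

package main

import (
    "k8s.io/apimachinery/pkg/runtime"
    utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    clientgoscheme "k8s.io/client-go/kubernetes/scheme"
)

var scheme = runtime.NewScheme()

func init() {
    // Must panics if AddToScheme returns an error, so broken type
    // registrations stop the process right away.
    utilruntime.Must(clientgoscheme.AddToScheme(scheme))
}

func main() {}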

config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml

Lines changed: 2 additions & 1 deletion
@@ -277,10 +277,11 @@ spec:
           type: object
         type: array
       mode:
-        default: CompactFirst
+        default: NodeCompactGPULowLoad
         enum:
         - CompactFirst
         - LowLoadFirst
+        - NodeCompactGPULowLoad
         type: string
     required:
     - mode

internal/constants/env.go

Lines changed: 6 additions & 2 deletions
@@ -67,12 +67,16 @@ const (
     RealNvmlLibPathValue = "/lib/x86_64-linux-gnu/libnvidia-ml.so.1"
     RealCUDALibPathValue = "/lib/x86_64-linux-gnu/libcuda.so"
 
-    PrependPathEnv          = "TF_PREPEND_PATH"
-    PrependLDLibraryPathEnv = "TF_PREPEND_LD_LIBRARY_PATH"
+    PrependPathEnv = "TF_PREPEND_PATH"
+
+    RunInsideGPUEnv = "RUN_INSIDE_GPU_NODE"
 
     LdPreloadFileName = "ld.so.preload"
     LdPreloadFile = "/etc/ld.so.preload"
 
+    LdLibraryPathFileName = "zz_tensor-fusion.conf"
+    LdLibraryPathFile = "/etc/ld.so.conf.d/zz_tensor-fusion.conf"
+
     TFLibsVolumeName = "tf-libs"
     TFLibsVolumeMountPath = "/tensor-fusion"
     TFConnectionNamePrefix = "-tf-vgpu-"
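The new constants switch the client/ngpu library path from LD_LIBRARY_PATH prepending to an /etc/ld.so.conf.d drop-in. A hypothetical sketch of producing that drop-in and refreshing the loader cache; the helper, the ldconfig call, and the example directory are illustrations, not code from this commit:

package main

import (
    "os"
    "os/exec"
)

// writeLdConf writes the tensor-fusion library directory into the drop-in
// named by LdLibraryPathFile so the dynamic loader finds the libraries
// without any process-level LD_LIBRARY_PATH changes.
func writeLdConf(libDir string) error {
    if err := os.WriteFile("/etc/ld.so.conf.d/zz_tensor-fusion.conf", []byte(libDir+"\n"), 0o644); err != nil {
        return err
    }
    // Rebuild the loader cache so the new search path takes effect.
    return exec.Command("ldconfig").Run()
}

func main() {
    _ = writeLdConf("/tensor-fusion") // example dir matching TFLibsVolumeMountPath
}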

internal/gpuallocator/filter/node_filter.go

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ func NewSameNodeFilter(count uint) *SameNodeFilter {
 
 // Filter implements GPUFilter.Filter
 // It groups GPUs by node and returns only those nodes that have at least 'count' GPUs
+// Must run as the last step, otherwise some of the nodes and GPUs it passes may not be valid in the following steps
 func (f *SameNodeFilter) Filter(ctx context.Context, workerPodKey tfv1.NameNamespace, gpus []*tfv1.GPU) ([]*tfv1.GPU, error) {
     // If count is 1 or 0, no need to filter by node
     if f.count <= 1 {
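The comment added above requires the same-node filter to run last because it reasons over whatever GPUs the earlier filters let through. A simplified sketch of that grouping, not the repo's implementation; nodeNameOf is a hypothetical helper standing in for however a tfv1.GPU records its node:

// filterSameNode keeps only GPUs on nodes that can still supply at least
// `count` of the GPUs that survived every earlier filter; running it sooner
// could keep nodes whose GPUs are later filtered out.
func filterSameNode(gpus []*tfv1.GPU, count uint, nodeNameOf func(*tfv1.GPU) string) []*tfv1.GPU {
    if count <= 1 {
        return gpus
    }
    byNode := make(map[string][]*tfv1.GPU)
    for _, gpu := range gpus {
        node := nodeNameOf(gpu)
        byNode[node] = append(byNode[node], gpu)
    }
    out := make([]*tfv1.GPU, 0, len(gpus))
    for _, group := range byNode {
        if uint(len(group)) >= count {
            out = append(out, group...)
        }
    }
    return out
}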

internal/gpuallocator/gpuallocator.go

Lines changed: 24 additions & 17 deletions
@@ -44,7 +44,9 @@ var mu sync.Mutex
 var GPUCapacityMap = map[string]tfv1.Resource{}
 
 type Strategy interface {
-    Score(gpu *tfv1.GPU) int
+    // When isForNode = true, the result is the GPU's node-level score;
+    // otherwise it is the score of a single GPU inside one node
+    Score(gpu *tfv1.GPU, isForNode bool) int
 
     SelectGPUs(gpus []*tfv1.GPU, count uint) ([]*tfv1.GPU, error)
 }
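As a hypothetical illustration of the two-level contract described by the new comment (only Score is sketched, and loadPercent is a made-up metric, not the repo's actual strategy logic): node-level calls favor packing work onto busier nodes, GPU-level calls favor the least-loaded GPU inside the chosen node.

// sketchStrategy is a made-up example honoring the new isForNode flag.
type sketchStrategy struct {
    loadPercent func(*tfv1.GPU) int // 0 = idle, 100 = fully loaded (assumed metric)
}

func (s sketchStrategy) Score(gpu *tfv1.GPU, isForNode bool) int {
    if isForNode {
        // Node level: busier GPUs raise their node's score, packing workers onto fewer nodes.
        return s.loadPercent(gpu)
    }
    // GPU level: within the chosen node, prefer the least-loaded GPU.
    return 100 - s.loadPercent(gpu)
}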
@@ -59,13 +61,14 @@ func (p *SimulateSchedulingFilterDetail) Clone() fwk.StateData {
 }
 
 // NewStrategy creates a strategy based on the placement mode
-func NewStrategy(placementMode tfv1.PlacementMode, cfg *config.GPUFitConfig) Strategy {
+func NewStrategy(placementMode tfv1.PlacementMode, cfg *config.GPUFitConfig, nodeGpuStore map[string]map[string]*tfv1.GPU) Strategy {
     switch placementMode {
     case tfv1.PlacementModeLowLoadFirst:
-        return LowLoadFirst{cfg: cfg}
+        return LowLoadFirst{cfg: cfg, nodeGpuStore: nodeGpuStore}
+    case tfv1.PlacementModeCompactFirst:
+        return CompactFirst{cfg: cfg, nodeGpuStore: nodeGpuStore}
     default:
-        // CompactFirst is the default strategy
-        return CompactFirst{cfg: cfg}
+        return NodeCompactGPULowLoad{cfg: cfg, nodeGpuStore: nodeGpuStore}
     }
 }
 
@@ -182,14 +185,16 @@ func (s *GpuAllocator) Filter(
         filterRegistry = filterRegistry.With(filter.NewGPUModelFilter(req.GPUModel))
     }
 
-    if req.Count > 1 {
-        filterRegistry = filterRegistry.With(filter.NewSameNodeFilter(req.Count))
-    }
-    // Add NodeAffinityFilter if specified
+    // NOTE: deprecated, use the Kubernetes native spec template affinity instead
     if req.NodeAffinity != nil {
         filterRegistry = filterRegistry.With(filter.NewNodeAffinityFilter(s.Client, req.NodeAffinity))
     }
 
+    // The same-node filter must be applied as the final step
+    if req.Count > 1 {
+        filterRegistry = filterRegistry.With(filter.NewSameNodeFilter(req.Count))
+    }
+
     // Apply the filters in sequence
     filteredGPUs, filterDetails, err := filterRegistry.Apply(s.ctx, req.WorkloadNameNamespace, toFilterGPUs, isSimulateSchedule)
     if err != nil {
@@ -245,7 +250,7 @@ func (s *GpuAllocator) Select(req *tfv1.AllocRequest, filteredGPUs []*tfv1.GPU)
 
     strategy := NewStrategy(schedulingConfigTemplate.Spec.Placement.Mode, &config.GPUFitConfig{
         MaxWorkerPerNode: s.maxWorkerPerNode,
-    })
+    }, s.nodeGpuStore)
     selectedGPUs, err := strategy.SelectGPUs(filteredGPUs, req.Count)
     if err != nil {
         return nil, fmt.Errorf("select GPU: %w", err)
@@ -670,18 +675,20 @@ type scoredGPU struct {
     score int
 }
 
+func (s *GpuAllocator) GetScoringStrategy(cfg *config.GPUFitConfig, req *tfv1.AllocRequest) Strategy {
+    return NewStrategy(s.getPlacementMode(s.ctx, req.PoolName), cfg, s.nodeGpuStore)
+}
+
 // First level is k8s node name, second level is GPU name, value is score
 func (s *GpuAllocator) Score(
-    ctx context.Context, cfg *config.GPUFitConfig, req *tfv1.AllocRequest, nodeGPUs map[string][]*tfv1.GPU,
+    ctx context.Context, strategy Strategy, req *tfv1.AllocRequest, nodeGPUs map[string][]*tfv1.GPU,
 ) map[string]map[string]int {
     result := make(map[string]map[string]int, len(nodeGPUs))
-    strategy := NewStrategy(s.getPlacementMode(ctx, req.PoolName), cfg)
-
     allScores := make([]scoredGPU, 0, len(nodeGPUs))
 
     for nodeName, gpus := range nodeGPUs {
         for _, gpu := range gpus {
-            res := strategy.Score(gpu)
+            res := strategy.Score(gpu, true)
 
             // making Pending GPU to lower score, prefer not scheduling to them
             if gpu.Status.Phase == tfv1.TensorFusionGPUPhasePending {
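With the refactor above, callers construct the scoring strategy once via GetScoringStrategy and pass it into Score. A minimal usage sketch against the new signatures, imports omitted; allocator, ctx, req, nodeGPUs, and maxWorkers are assumed to come from the surrounding scheduler code:

// Build the pool's strategy once, then score every candidate GPU per node.
strategy := allocator.GetScoringStrategy(&config.GPUFitConfig{
    MaxWorkerPerNode: maxWorkers,
}, req)
scores := allocator.Score(ctx, strategy, req, nodeGPUs)

// First level is the k8s node name, second level is the GPU name.
for nodeName, gpuScores := range scores {
    for gpuName, score := range gpuScores {
        log.Printf("node=%s gpu=%s score=%d", nodeName, gpuName, score)
    }
}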
@@ -1477,18 +1484,18 @@ func (s *GpuAllocator) getPlacementMode(ctx context.Context, poolName string) tf
     pool := &tfv1.GPUPool{}
     if err := s.Get(ctx, client.ObjectKey{Name: poolName}, pool); err != nil {
         // if failed to get pool, default to compact first
-        return tfv1.PlacementModeCompactFirst
+        return tfv1.PlacementModeNodeCompactGPULowLoad
     }
 
     if pool.Spec.SchedulingConfigTemplate == nil || *pool.Spec.SchedulingConfigTemplate == "" {
-        return tfv1.PlacementModeCompactFirst
+        return tfv1.PlacementModeNodeCompactGPULowLoad
     }
 
     // get scheduling config template
     schedulingConfigTemplate := &tfv1.SchedulingConfigTemplate{}
     if err := s.Get(ctx, client.ObjectKey{Name: *pool.Spec.SchedulingConfigTemplate}, schedulingConfigTemplate); err != nil {
         // if failed to get scheduling config template, default to compact first
-        return tfv1.PlacementModeCompactFirst
+        return tfv1.PlacementModeNodeCompactGPULowLoad
     }
     return schedulingConfigTemplate.Spec.Placement.Mode
 }
