fix: improve worker and pool metrics

Code2Life · Code2Life · commit 331e4bb70a5d · 2025-08-07T11:20:24.000+08:00
diff --git a/internal/controller/gpunode_controller.go b/internal/controller/gpunode_controller.go
@@ -158,6 +158,7 @@ func (r *GPUNodeReconciler) checkStatusAndUpdateVirtualCapacity(ctx context.Cont
 			if err != nil {
 				return fmt.Errorf("failed to update GPU node status to pending: %w", err)
 			}
+			metrics.SetNodeMetrics(node, poolObj, nil)
 		}
 
 		err := r.syncStatusToGPUDevices(ctx, node, tfv1.TensorFusionGPUPhasePending)
@@ -172,7 +173,7 @@ func (r *GPUNodeReconciler) checkStatusAndUpdateVirtualCapacity(ctx context.Cont
 			return err
 		}
 		if len(gpuModels) == 0 {
-			// when GPU created, will trigger next reconcile
+			log.FromContext(ctx).Info("GPU models not found, skip update", "node", node.Name)
 			return nil
 		}
 
@@ -290,6 +291,16 @@ func (r *GPUNodeReconciler) reconcileNodeDiscoveryJob(
 		}
 	}
 
+	if job.Status.Failed > 0 {
+		log.Info("node discovery job failed, update GPU node status to failed", "node", gpunode.Name)
+		// Update phase to failed; this requires manually investigating why it failed and restarting the node discovery job
+		gpunode.Status.Phase = tfv1.TensorFusionGPUNodePhaseFailed
+		if err := r.Status().Update(ctx, gpunode); err != nil {
+			return fmt.Errorf("failed to update GPU node status to failed: %w", err)
+		}
+		metrics.SetNodeMetrics(gpunode, pool, nil)
+	}
+
 	return nil
 }
 
diff --git a/internal/metrics/recorder.go b/internal/metrics/recorder.go
@@ -93,6 +93,7 @@ func SetWorkerMetricsByWorkload(pod *corev1.Pod) {
 	metricsItem.TflopsLimit = gpuLimitResource.Tflops.AsApproximateFloat64()
 	metricsItem.VramBytesRequest = gpuRequestResource.Vram.AsApproximateFloat64()
 	metricsItem.VramBytesLimit = gpuLimitResource.Vram.AsApproximateFloat64()
+	metricsItem.Ready = utils.IsPodConditionTrue(pod.Status.Conditions, corev1.PodReady)
 	if count <= 0 || count > uint64(math.MaxInt32) {
 		// handle invalid or out-of-bounds data
 		metricsItem.GPUCount = 1
@@ -116,6 +117,7 @@ func SetNodeMetrics(node *tfv1.GPUNode, poolObj *tfv1.GPUPool, gpuModels []strin
 	// Fields that possibly change after initialization
 	metricsItem := nodeMetricsMap[node.Name]
 	metricsItem.PoolName = poolObj.Name
+	metricsItem.Phase = string(node.Status.Phase)
 	metricsItem.SetGPUModelAndCount(gpuModels)
 
 	totalTflops := node.Status.TotalTFlops.AsApproximateFloat64()
@@ -279,6 +281,7 @@ func (mr *MetricsRecorder) RecordMetrics(writer io.Writer) {
 		enc.AddField("raw_cost", metrics.RawCost)
 		enc.AddField("vram_bytes_limit", metrics.VramBytesLimit)
 		enc.AddField("vram_bytes_request", metrics.VramBytesRequest)
+		enc.AddField("ready", metrics.Ready)
 
 		enc.EndLine(now)
 	}
@@ -302,6 +305,7 @@ func (mr *MetricsRecorder) RecordMetrics(writer io.Writer) {
 
 		enc.AddTag("node", metrics.NodeName)
 		enc.AddTag("pool", metrics.PoolName)
+		enc.AddTag("phase", metrics.Phase)
 
 		enc.AddField("allocated_tflops", metrics.AllocatedTflops)
 		enc.AddField("allocated_tflops_percent", metrics.AllocatedTflopsPercent)
diff --git a/internal/metrics/types.go b/internal/metrics/types.go
@@ -42,6 +42,7 @@ type WorkerResourceMetrics struct {
 	VramBytesLimit   float64 `json:"vramBytesLimit" gorm:"column:vram_bytes_limit"`
 	GPUCount         int     `json:"gpuCount" gorm:"column:gpu_count"`
 	RawCost          float64 `json:"rawCost" gorm:"column:raw_cost"`
+	Ready            bool    `json:"ready" gorm:"column:ready"`
 
 	// NOTE: make sure new fields will be migrated in SetupTable function
 
@@ -60,6 +61,7 @@ func (wm WorkerResourceMetrics) TableName() string {
 type NodeResourceMetrics struct {
 	NodeName string `json:"nodeName" gorm:"column:node;index:,class:INVERTED"`
 	PoolName string `json:"poolName" gorm:"column:pool;index:,class:INVERTED"`
+	Phase    string `json:"phase" gorm:"column:phase;index:,class:INVERTED"`
 
 	AllocatedTflops        float64 `json:"allocatedTflops" gorm:"column:allocated_tflops"`
 	AllocatedTflopsPercent float64 `json:"allocatedTflopsPercent" gorm:"column:allocated_tflops_percent"`
@@ -86,6 +88,9 @@ func (nm NodeResourceMetrics) TableName() string {
 }
 
 func (nm *NodeResourceMetrics) SetGPUModelAndCount(gpuModels []string) {
+	if gpuModels == nil {
+		return
+	}
 	nm.gpuModels = gpuModels
 	nm.GPUCount = len(gpuModels)
 }

Original file line number	Diff line number	Diff line change
`@@ -158,6 +158,7 @@ func (r *GPUNodeReconciler) checkStatusAndUpdateVirtualCapacity(ctx context.Cont`
`158`	`158`	`if err != nil {`
`159`	`159`	`return fmt.Errorf("failed to update GPU node status to pending: %w", err)`
`160`	`160`	`}`
	`161`	`+ metrics.SetNodeMetrics(node, poolObj, nil)`
`161`	`162`	`}`
`162`	`163`
`163`	`164`	`err := r.syncStatusToGPUDevices(ctx, node, tfv1.TensorFusionGPUPhasePending)`
`@@ -172,7 +173,7 @@ func (r *GPUNodeReconciler) checkStatusAndUpdateVirtualCapacity(ctx context.Cont`
`172`	`173`	`return err`
`173`	`174`	`}`
`174`	`175`	`if len(gpuModels) == 0 {`
`175`		`- // when GPU created, will trigger next reconcile`
	`176`	`+ log.FromContext(ctx).Info("GPU models not found, skip update", "node", node.Name)`
`176`	`177`	`return nil`
`177`	`178`	`}`
`178`	`179`
`@@ -290,6 +291,16 @@ func (r *GPUNodeReconciler) reconcileNodeDiscoveryJob(`
`290`	`291`	`}`
`291`	`292`	`}`
`292`	`293`
	`294`	`+ if job.Status.Failed > 0 {`
	`295`	`+ log.Info("node discovery job failed, update GPU node status to failed", "node", gpunode.Name)`
	`296`	`+ // Update phase to failed; this requires manually investigating why it failed and restarting the node discovery job`
	`297`	`+ gpunode.Status.Phase = tfv1.TensorFusionGPUNodePhaseFailed`
	`298`	`+ if err := r.Status().Update(ctx, gpunode); err != nil {`
	`299`	`+ return fmt.Errorf("failed to update GPU node status to failed: %w", err)`
	`300`	`+ }`
	`301`	`+ metrics.SetNodeMetrics(gpunode, pool, nil)`
	`302`	`+ }`
	`303`	`+`
`293`	`304`	`return nil`
`294`	`305`	`}`
`295`	`306`