Skip to content

Commit 331e4bb

Browse files
committed
fix: improve worker and pool metrics
1 parent 1eeb306 commit 331e4bb

File tree

3 files changed

+21
-1
lines changed

3 files changed

+21
-1
lines changed

internal/controller/gpunode_controller.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ func (r *GPUNodeReconciler) checkStatusAndUpdateVirtualCapacity(ctx context.Cont
158158
if err != nil {
159159
return fmt.Errorf("failed to update GPU node status to pending: %w", err)
160160
}
161+
metrics.SetNodeMetrics(node, poolObj, nil)
161162
}
162163

163164
err := r.syncStatusToGPUDevices(ctx, node, tfv1.TensorFusionGPUPhasePending)
@@ -172,7 +173,7 @@ func (r *GPUNodeReconciler) checkStatusAndUpdateVirtualCapacity(ctx context.Cont
172173
return err
173174
}
174175
if len(gpuModels) == 0 {
175-
// when GPU created, will trigger next reconcile
176+
log.FromContext(ctx).Info("GPU models not found, skip update", "node", node.Name)
176177
return nil
177178
}
178179

@@ -290,6 +291,16 @@ func (r *GPUNodeReconciler) reconcileNodeDiscoveryJob(
290291
}
291292
}
292293

294+
if job.Status.Failed > 0 {
295+
log.Info("node discovery job failed, update GPU node status to failed", "node", gpunode.Name)
296+
// Update phase to failed; an operator must manually investigate why the job failed and restart the node discovery job
297+
gpunode.Status.Phase = tfv1.TensorFusionGPUNodePhaseFailed
298+
if err := r.Status().Update(ctx, gpunode); err != nil {
299+
return fmt.Errorf("failed to update GPU node status to failed: %w", err)
300+
}
301+
metrics.SetNodeMetrics(gpunode, pool, nil)
302+
}
303+
293304
return nil
294305
}
295306

internal/metrics/recorder.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ func SetWorkerMetricsByWorkload(pod *corev1.Pod) {
9393
metricsItem.TflopsLimit = gpuLimitResource.Tflops.AsApproximateFloat64()
9494
metricsItem.VramBytesRequest = gpuRequestResource.Vram.AsApproximateFloat64()
9595
metricsItem.VramBytesLimit = gpuLimitResource.Vram.AsApproximateFloat64()
96+
metricsItem.Ready = utils.IsPodConditionTrue(pod.Status.Conditions, corev1.PodReady)
9697
if count <= 0 || count > uint64(math.MaxInt32) {
9798
// handle invalid or out-of-bounds data
9899
metricsItem.GPUCount = 1
@@ -116,6 +117,7 @@ func SetNodeMetrics(node *tfv1.GPUNode, poolObj *tfv1.GPUPool, gpuModels []strin
116117
// Fields that possibly change after initialization
117118
metricsItem := nodeMetricsMap[node.Name]
118119
metricsItem.PoolName = poolObj.Name
120+
metricsItem.Phase = string(node.Status.Phase)
119121
metricsItem.SetGPUModelAndCount(gpuModels)
120122

121123
totalTflops := node.Status.TotalTFlops.AsApproximateFloat64()
@@ -279,6 +281,7 @@ func (mr *MetricsRecorder) RecordMetrics(writer io.Writer) {
279281
enc.AddField("raw_cost", metrics.RawCost)
280282
enc.AddField("vram_bytes_limit", metrics.VramBytesLimit)
281283
enc.AddField("vram_bytes_request", metrics.VramBytesRequest)
284+
enc.AddField("ready", metrics.Ready)
282285

283286
enc.EndLine(now)
284287
}
@@ -302,6 +305,7 @@ func (mr *MetricsRecorder) RecordMetrics(writer io.Writer) {
302305

303306
enc.AddTag("node", metrics.NodeName)
304307
enc.AddTag("pool", metrics.PoolName)
308+
enc.AddTag("phase", metrics.Phase)
305309

306310
enc.AddField("allocated_tflops", metrics.AllocatedTflops)
307311
enc.AddField("allocated_tflops_percent", metrics.AllocatedTflopsPercent)

internal/metrics/types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ type WorkerResourceMetrics struct {
4242
VramBytesLimit float64 `json:"vramBytesLimit" gorm:"column:vram_bytes_limit"`
4343
GPUCount int `json:"gpuCount" gorm:"column:gpu_count"`
4444
RawCost float64 `json:"rawCost" gorm:"column:raw_cost"`
45+
Ready bool `json:"ready" gorm:"column:ready"`
4546

4647
// NOTE: make sure new fields will be migrated in SetupTable function
4748

@@ -60,6 +61,7 @@ func (wm WorkerResourceMetrics) TableName() string {
6061
type NodeResourceMetrics struct {
6162
NodeName string `json:"nodeName" gorm:"column:node;index:,class:INVERTED"`
6263
PoolName string `json:"poolName" gorm:"column:pool;index:,class:INVERTED"`
64+
Phase string `json:"phase" gorm:"column:phase;index:,class:INVERTED"`
6365

6466
AllocatedTflops float64 `json:"allocatedTflops" gorm:"column:allocated_tflops"`
6567
AllocatedTflopsPercent float64 `json:"allocatedTflopsPercent" gorm:"column:allocated_tflops_percent"`
@@ -86,6 +88,9 @@ func (nm NodeResourceMetrics) TableName() string {
8688
}
8789

8890
func (nm *NodeResourceMetrics) SetGPUModelAndCount(gpuModels []string) {
91+
if gpuModels == nil {
92+
return
93+
}
8994
nm.gpuModels = gpuModels
9095
nm.GPUCount = len(gpuModels)
9196
}

0 commit comments

Comments
 (0)