Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions pkg/gpu/nvidia/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@ import (
type MemoryUnit string

const (
resourceName = "aliyun.com/gpu-mem"
resourceCount = "aliyun.com/gpu-count"
serverSock = pluginapi.DevicePluginPath + "aliyungpushare.sock"
resourceName = "aliyun.com/gpu-mem"
resourceCount = "aliyun.com/gpu-count"
resourceStatus = "aliyun.com/gpu-dev-"
serverSock = pluginapi.DevicePluginPath + "aliyungpushare.sock"

OptimisticLockErrorMsg = "the object has been modified; please apply your changes to the latest version and try again"

Expand Down
17 changes: 16 additions & 1 deletion pkg/gpu/nvidia/nvidia.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ func deviceExists(devs []*pluginapi.Device, id string) bool {
return false
}

func watchXIDs(ctx context.Context, devs []*pluginapi.Device, xids chan<- *pluginapi.Device) {
func watchXIDs(ctx context.Context, devs []*pluginapi.Device, xids chan<- *pluginapi.Device, devNameMap map[string]uint) {
eventSet := nvml.NewEventSet()
defer nvml.DeleteEventSet(eventSet)

Expand Down Expand Up @@ -135,15 +135,30 @@ func watchXIDs(ctx context.Context, devs []*pluginapi.Device, xids chan<- *plugi
continue
}

var devIdArray []uint
if e.UUID == nil || len(*e.UUID) == 0 {
// All devices are unhealthy
for _, d := range devs {
xids <- d
}

for _, v := range devNameMap {
devIdArray = append(devIdArray, v)
}

patchGPUUnhealthyStatus(devIdArray)
continue
}

val, ok := devNameMap[*e.UUID]
if ok {
patchGPUUnhealthyStatus(append(devIdArray, val))
} else {
log.Errorf("Unknown UUID %s", *e.UUID)
}

for _, d := range devs {

if extractRealDeviceID(d.ID) == *e.UUID {
xids <- d
}
Expand Down
25 changes: 25 additions & 0 deletions pkg/gpu/nvidia/podmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,11 @@ func patchGPUCount(gpuCount int) error {
newNode := node.DeepCopy()
newNode.Status.Capacity[resourceCount] = *resource.NewQuantity(int64(gpuCount), resource.DecimalSI)
newNode.Status.Allocatable[resourceCount] = *resource.NewQuantity(int64(gpuCount), resource.DecimalSI)

for i := 0; i < gpuCount; i++ {
newNode.Status.Allocatable[v1.ResourceName(fmt.Sprintf("%s%d", resourceStatus, i))] =
*resource.NewQuantity(1, resource.DecimalSI)
}
// content := fmt.Sprintf(`[{"op": "add", "path": "/status/capacity/aliyun.com~gpu-count", "value": "%d"}]`, gpuCount)
// _, err = clientset.CoreV1().Nodes().PatchStatus(nodeName, []byte(content))
_, _, err = nodeutil.PatchNodeStatus(clientset.CoreV1(), types.NodeName(nodeName), node, newNode)
Expand All @@ -84,6 +89,26 @@ func patchGPUCount(gpuCount int) error {
return err
}

// patchGPUUnhealthyStatus marks the GPUs whose indices are listed in array as
// unavailable: for each index it sets the node's allocatable quantity of the
// per-device resource (resourceStatus followed by the index) to 0 and then
// patches the node status.
//
// It returns the error from fetching the node or from patching its status.
// An empty array is a no-op: nil is returned without fetching or patching
// the node.
func patchGPUUnhealthyStatus(array []uint) error {
	if len(array) == 0 {
		// Nothing to mark; avoid a pointless Get/Patch round trip.
		return nil
	}

	node, err := clientset.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
	if err != nil {
		return err
	}

	newNode := node.DeepCopy()
	// Assigning into a nil map panics, so make sure the allocatable list
	// exists before writing the per-device entries.
	if newNode.Status.Allocatable == nil {
		newNode.Status.Allocatable = v1.ResourceList{}
	}
	for _, val := range array {
		name := v1.ResourceName(fmt.Sprintf("%s%d", resourceStatus, val))
		newNode.Status.Allocatable[name] = *resource.NewQuantity(0, resource.DecimalSI)
	}

	_, _, err = nodeutil.PatchNodeStatus(clientset.CoreV1(), types.NodeName(nodeName), node, newNode)
	if err != nil {
		// The visible caller discards the returned error, so log it here at
		// error level together with the affected device indices.
		log.Errorf("Failed to mark GPU devices %v (%s<index>) unhealthy: %v", array, resourceStatus, err)
		return err
	}
	log.Infof("Marked GPU devices %v (%s<index>) unhealthy in node allocatable.", array, resourceStatus)
	return nil
}

func getPendingPodsInNode() ([]v1.Pod, error) {
// pods, err := m.lister.List(labels.Everything())
// if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion pkg/gpu/nvidia/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ func (m *NvidiaDevicePlugin) healthcheck() {
var xids chan *pluginapi.Device
if m.healthCheck {
xids = make(chan *pluginapi.Device)
go watchXIDs(ctx, m.devs, xids)
go watchXIDs(ctx, m.devs, xids, m.devNameMap)
}

for {
Expand Down