diff --git a/cmd/nvidia/main.go b/cmd/nvidia/main.go
index 05747a7d..a4db55ce 100644
--- a/cmd/nvidia/main.go
+++ b/cmd/nvidia/main.go
@@ -11,12 +11,13 @@ var (
 	mps         = flag.Bool("mps", false, "Enable or Disable MPS")
 	healthCheck = flag.Bool("health-check", false, "Enable or disable Health check")
 	memoryUnit  = flag.String("memory-unit", "GiB", "Set memoryUnit of the GPU Memroy, support 'GiB' and 'MiB'")
+	mpspipe     = flag.String("mps-pipe", "/tmp/nvidia-mps", "Directory for the MPS named pipes and UNIX domain sockets")
 )
 
 func main() {
 	flag.Parse()
 	log.V(1).Infoln("Start gpushare device plugin")
-	ngm := nvidia.NewSharedGPUManager(*mps, *healthCheck, translatememoryUnits(*memoryUnit))
+	ngm := nvidia.NewSharedGPUManager(*mps, *healthCheck, *mpspipe, translatememoryUnits(*memoryUnit))
 	err := ngm.Run()
 	if err != nil {
 		log.Fatalf("Failed due to %v", err)
diff --git a/device-plugin-ds.yaml b/device-plugin-ds.yaml
index 1c065131..06453778 100644
--- a/device-plugin-ds.yaml
+++ b/device-plugin-ds.yaml
@@ -25,6 +25,8 @@ spec:
           - gpushare-device-plugin-v2
           - -logtostderr
           - --v=5
+          # - --mps-pipe=/root/nvidia-mps   # directory through which the MPS clients and the MPS server communicate; change it if needed
+          # - --mps=true                    # enable if you want to use MPS
           - --memory-unit=GiB
         resources:
           limits:
diff --git a/pkg/gpu/nvidia/allocate.go b/pkg/gpu/nvidia/allocate.go
index a0c7aa26..d2c70ec1 100644
--- a/pkg/gpu/nvidia/allocate.go
+++ b/pkg/gpu/nvidia/allocate.go
@@ -21,18 +21,22 @@ func init() {
 	kubeInit()
 }
 
-func buildErrResponse(reqs *pluginapi.AllocateRequest, podReqGPU uint) *pluginapi.AllocateResponse {
+func (m *NvidiaDevicePlugin) buildErrResponse(reqs *pluginapi.AllocateRequest, podReqGPU uint) *pluginapi.AllocateResponse {
 	responses := pluginapi.AllocateResponse{}
 	for _, req := range reqs.ContainerRequests {
 		response := pluginapi.ContainerAllocateResponse{
 			Envs: map[string]string{
-				envNVGPU:               fmt.Sprintf("no-gpu-has-%dMiB-to-run", podReqGPU),
+				envNVGPU:               fmt.Sprintf("no-gpu-has-%dGiB-to-run", podReqGPU),
 				EnvResourceIndex:       fmt.Sprintf("-1"),
 				EnvResourceByPod:       fmt.Sprintf("%d", podReqGPU),
 				EnvResourceByContainer: fmt.Sprintf("%d", uint(len(req.DevicesIDs))),
 				EnvResourceByDev:       fmt.Sprintf("%d", getGPUMemory()),
 			},
 		}
+		if m.mps {
+			response.Envs[EnvMPSActiveThreadPercentage] = fmt.Sprintf("%d", 100*uint(len(req.DevicesIDs))/getGPUMemory())
+			response.Envs[EnvMPSPipeDirectory] = m.mpspipe
+		}
 		responses.ContainerResponses = append(responses.ContainerResponses, &response)
 	}
 	return &responses
@@ -62,7 +66,7 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
 	pods, err := getCandidatePods()
 	if err != nil {
 		log.Infof("invalid allocation requst: Failed to find candidate pods due to %v", err)
-		return buildErrResponse(reqs, podReqGPU), nil
+		return m.buildErrResponse(reqs, podReqGPU), nil
 	}
 
 	if log.V(4) {
@@ -106,7 +110,7 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
 	}
 
 	if id < 0 {
-		return buildErrResponse(reqs, podReqGPU), nil
+		return m.buildErrResponse(reqs, podReqGPU), nil
 	}
 
 	// 1. Create container requests
@@ -121,6 +125,15 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
 				EnvResourceByDev:       fmt.Sprintf("%d", getGPUMemory()),
 			},
 		}
+		if m.mps {
+			response.Envs[EnvMPSActiveThreadPercentage] = fmt.Sprintf("%d", 100*reqGPU/getGPUMemory())
+			response.Envs[EnvMPSPipeDirectory] = m.mpspipe
+			mount := pluginapi.Mount{
+				ContainerPath: m.mpspipe,
+				HostPath:      m.mpspipe,
+			}
+			response.Mounts = append(response.Mounts, &mount)
+		}
 		responses.ContainerResponses = append(responses.ContainerResponses, &response)
 	}
 
@@ -134,17 +147,17 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
 			pod, err := clientset.CoreV1().Pods(assumePod.Namespace).Get(assumePod.Name, metav1.GetOptions{})
 			if err != nil {
 				log.Warningf("Failed due to %v", err)
-				return buildErrResponse(reqs, podReqGPU), nil
+				return m.buildErrResponse(reqs, podReqGPU), nil
 			}
 			newPod = updatePodAnnotations(pod)
 			_, err = clientset.CoreV1().Pods(newPod.Namespace).Update(newPod)
 			if err != nil {
 				log.Warningf("Failed due to %v", err)
-				return buildErrResponse(reqs, podReqGPU), nil
+				return m.buildErrResponse(reqs, podReqGPU), nil
 			}
 		} else {
 			log.Warningf("Failed due to %v", err)
-			return buildErrResponse(reqs, podReqGPU), nil
+			return m.buildErrResponse(reqs, podReqGPU), nil
 		}
 	}
 
@@ -152,7 +165,7 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
 		log.Warningf("invalid allocation requst: request GPU memory %d can't be satisfied.", podReqGPU)
 		// return &responses, fmt.Errorf("invalid allocation requst: request GPU memory %d can't be satisfied", reqGPU)
-		return buildErrResponse(reqs, podReqGPU), nil
+		return m.buildErrResponse(reqs, podReqGPU), nil
 	}
 
 	log.Infof("new allocated GPUs info %v", &responses)
diff --git a/pkg/gpu/nvidia/const.go b/pkg/gpu/nvidia/const.go
index 97d9afdb..71d4d5f2 100644
--- a/pkg/gpu/nvidia/const.go
+++ b/pkg/gpu/nvidia/const.go
@@ -21,14 +21,16 @@ const (
 	containerLogPathLabelKey = "io.kubernetes.container.logpath"
 	sandboxIDLabelKey        = "io.kubernetes.sandbox.id"
 
-	envNVGPU               = "NVIDIA_VISIBLE_DEVICES"
-	EnvResourceIndex       = "ALIYUN_COM_GPU_MEM_IDX"
-	EnvResourceByPod       = "ALIYUN_COM_GPU_MEM_POD"
-	EnvResourceByContainer = "ALIYUN_COM_GPU_MEM_CONTAINER"
-	EnvResourceByDev       = "ALIYUN_COM_GPU_MEM_DEV"
-	EnvAssignedFlag        = "ALIYUN_COM_GPU_MEM_ASSIGNED"
-	EnvResourceAssumeTime  = "ALIYUN_COM_GPU_MEM_ASSUME_TIME"
-	EnvResourceAssignTime  = "ALIYUN_COM_GPU_MEM_ASSIGN_TIME"
+	envNVGPU                     = "NVIDIA_VISIBLE_DEVICES"
+	EnvResourceIndex             = "ALIYUN_COM_GPU_MEM_IDX"
+	EnvResourceByPod             = "ALIYUN_COM_GPU_MEM_POD"
+	EnvResourceByContainer       = "ALIYUN_COM_GPU_MEM_CONTAINER"
+	EnvResourceByDev             = "ALIYUN_COM_GPU_MEM_DEV"
+	EnvAssignedFlag              = "ALIYUN_COM_GPU_MEM_ASSIGNED"
+	EnvResourceAssumeTime        = "ALIYUN_COM_GPU_MEM_ASSUME_TIME"
+	EnvMPSPipeDirectory          = "CUDA_MPS_PIPE_DIRECTORY"
+	EnvMPSActiveThreadPercentage = "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"
+	EnvResourceAssignTime        = "ALIYUN_COM_GPU_MEM_ASSIGN_TIME"
 
 	GiBPrefix = MemoryUnit("GiB")
 	MiBPrefix = MemoryUnit("MiB")
diff --git a/pkg/gpu/nvidia/gpumanager.go b/pkg/gpu/nvidia/gpumanager.go
index 98774f48..97c9a153 100644
--- a/pkg/gpu/nvidia/gpumanager.go
+++ b/pkg/gpu/nvidia/gpumanager.go
@@ -14,13 +14,15 @@ import (
 type sharedGPUManager struct {
 	enableMPS   bool
 	healthCheck bool
+	mpspipe     string
 }
 
-func NewSharedGPUManager(enableMPS, healthCheck bool, bp MemoryUnit) *sharedGPUManager {
+func NewSharedGPUManager(enableMPS, healthCheck bool, mpspipe string, bp MemoryUnit) *sharedGPUManager {
 	metric = bp
 	return &sharedGPUManager{
 		enableMPS:   enableMPS,
 		healthCheck: healthCheck,
+		mpspipe:     mpspipe,
 	}
 }
 
@@ -61,7 +63,7 @@ L:
 				devicePlugin.Stop()
 			}
 
-			devicePlugin = NewNvidiaDevicePlugin(ngm.enableMPS, ngm.healthCheck)
+			devicePlugin = NewNvidiaDevicePlugin(ngm.enableMPS, ngm.healthCheck, ngm.mpspipe)
 			if err := devicePlugin.Serve(); err != nil {
 				log.Warningf("Failed to start device plugin due to %v", err)
 			} else {
diff --git a/pkg/gpu/nvidia/server.go b/pkg/gpu/nvidia/server.go
index f09e9124..32c10cb5 100644
--- a/pkg/gpu/nvidia/server.go
+++ b/pkg/gpu/nvidia/server.go
@@ -22,6 +22,7 @@ type NvidiaDevicePlugin struct {
 	devIndxMap  map[uint]string
 	socket      string
 	mps         bool
+	mpspipe     string
 	healthCheck bool
 
 	stop   chan struct{}
@@ -32,7 +33,7 @@ type NvidiaDevicePlugin struct {
 }
 
 // NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin
-func NewNvidiaDevicePlugin(mps, healthCheck bool) *NvidiaDevicePlugin {
+func NewNvidiaDevicePlugin(mps, healthCheck bool, mpspipe string) *NvidiaDevicePlugin {
 	devs, devNameMap := getDevices()
 	devList := []string{}
 
@@ -54,10 +55,10 @@ func NewNvidiaDevicePlugin(mps, healthCheck bool) *NvidiaDevicePlugin {
 		devNameMap:  devNameMap,
 		socket:      serverSock,
 		mps:         mps,
+		mpspipe:     mpspipe,
 		healthCheck: healthCheck,
-
-		stop:   make(chan struct{}),
-		health: make(chan *pluginapi.Device),
+		stop:        make(chan struct{}),
+		health:      make(chan *pluginapi.Device),
 	}
 }
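
For context on the CUDA_MPS_ACTIVE_THREAD_PERCENTAGE value injected above: the patch derives it as the container's GPU-memory request taken as a whole-number share of the device's total memory (100*reqGPU/getGPUMemory()). Below is a minimal standalone sketch of that mapping; the helper name mpsActiveThreadPercentage and the 4 GiB / 16 GiB figures are illustrative only, not part of the plugin.

package main

import "fmt"

// mpsActiveThreadPercentage mirrors the expression used in Allocate():
// the requested GPU memory divided by the device's total memory, expressed
// as a whole-number percentage for CUDA_MPS_ACTIVE_THREAD_PERCENTAGE.
// Integer division means a request below 1% of the device rounds down to 0.
func mpsActiveThreadPercentage(reqGPU, totalGPU uint) uint {
    if totalGPU == 0 {
        return 0
    }
    return 100 * reqGPU / totalGPU
}

func main() {
    // e.g. a container requesting 4 GiB on a 16 GiB GPU is limited to 25% of the SM threads.
    fmt.Println(mpsActiveThreadPercentage(4, 16)) // 25
}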