Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cmd/nvidia/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@ var (
mps = flag.Bool("mps", false, "Enable or Disable MPS")
healthCheck = flag.Bool("health-check", false, "Enable or disable Health check")
memoryUnit = flag.String("memory-unit", "GiB", "Set memoryUnit of the GPU Memroy, support 'GiB' and 'MiB'")
mpspipe = flag.String("mps-pipe", "/tmp/nvidia-mps", " pipes and UNIX domain sockets")
)

func main() {
flag.Parse()
log.V(1).Infoln("Start gpushare device plugin")
ngm := nvidia.NewSharedGPUManager(*mps, *healthCheck, translatememoryUnits(*memoryUnit))
ngm := nvidia.NewSharedGPUManager(*mps, *healthCheck, *mpspipe, translatememoryUnits(*memoryUnit))
err := ngm.Run()
if err != nil {
log.Fatalf("Failed due to %v", err)
Expand Down
2 changes: 2 additions & 0 deletions device-plugin-ds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ spec:
- gpushare-device-plugin-v2
- -logtostderr
- --v=5
# - --mps-pipe=/root/nvidia-mps  # directory through which the MPS client and server communicate; adjust as needed
# - --mps=true  # uncomment to enable MPS
- --memory-unit=GiB
resources:
limits:
Expand Down
29 changes: 21 additions & 8 deletions pkg/gpu/nvidia/allocate.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,22 @@ func init() {
kubeInit()
}

func buildErrResponse(reqs *pluginapi.AllocateRequest, podReqGPU uint) *pluginapi.AllocateResponse {
func (m *NvidiaDevicePlugin) buildErrResponse(reqs *pluginapi.AllocateRequest, podReqGPU uint) *pluginapi.AllocateResponse {
responses := pluginapi.AllocateResponse{}
for _, req := range reqs.ContainerRequests {
response := pluginapi.ContainerAllocateResponse{
Envs: map[string]string{
envNVGPU: fmt.Sprintf("no-gpu-has-%dMiB-to-run", podReqGPU),
envNVGPU: fmt.Sprintf("no-gpu-has-%dGiB-to-run", podReqGPU),
EnvResourceIndex: fmt.Sprintf("-1"),
EnvResourceByPod: fmt.Sprintf("%d", podReqGPU),
EnvResourceByContainer: fmt.Sprintf("%d", uint(len(req.DevicesIDs))),
EnvResourceByDev: fmt.Sprintf("%d", getGPUMemory()),
},
}
if m.mps {
response.Envs[EnvMPSActiveThreadPercentage] = fmt.Sprintf("%d", 100*uint(len(req.DevicesIDs))/getGPUMemory())
response.Envs[EnvMPSPipeDirectory] = fmt.Sprintf(m.mpspipe)
}
responses.ContainerResponses = append(responses.ContainerResponses, &response)
}
return &responses
Expand Down Expand Up @@ -62,7 +66,7 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
pods, err := getCandidatePods()
if err != nil {
log.Infof("invalid allocation requst: Failed to find candidate pods due to %v", err)
return buildErrResponse(reqs, podReqGPU), nil
return m.buildErrResponse(reqs, podReqGPU), nil
}

if log.V(4) {
Expand Down Expand Up @@ -106,7 +110,7 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
}

if id < 0 {
return buildErrResponse(reqs, podReqGPU), nil
return m.buildErrResponse(reqs, podReqGPU), nil
}

// 1. Create container requests
Expand All @@ -121,6 +125,15 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
EnvResourceByDev: fmt.Sprintf("%d", getGPUMemory()),
},
}
if m.mps {
response.Envs[EnvMPSActiveThreadPercentage] = fmt.Sprintf("%d", 100*reqGPU/getGPUMemory())
response.Envs[EnvMPSPipeDirectory] = fmt.Sprintf(m.mpspipe)
mount := pluginapi.Mount{
ContainerPath: m.mpspipe,
HostPath: m.mpspipe,
}
response.Mounts = append(response.Mounts, &mount)
}
responses.ContainerResponses = append(responses.ContainerResponses, &response)
}

Expand All @@ -134,25 +147,25 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
pod, err := clientset.CoreV1().Pods(assumePod.Namespace).Get(assumePod.Name, metav1.GetOptions{})
if err != nil {
log.Warningf("Failed due to %v", err)
return buildErrResponse(reqs, podReqGPU), nil
return m.buildErrResponse(reqs, podReqGPU), nil
}
newPod = updatePodAnnotations(pod)
_, err = clientset.CoreV1().Pods(newPod.Namespace).Update(newPod)
if err != nil {
log.Warningf("Failed due to %v", err)
return buildErrResponse(reqs, podReqGPU), nil
return m.buildErrResponse(reqs, podReqGPU), nil
}
} else {
log.Warningf("Failed due to %v", err)
return buildErrResponse(reqs, podReqGPU), nil
return m.buildErrResponse(reqs, podReqGPU), nil
}
}

} else {
log.Warningf("invalid allocation requst: request GPU memory %d can't be satisfied.",
podReqGPU)
// return &responses, fmt.Errorf("invalid allocation requst: request GPU memory %d can't be satisfied", reqGPU)
return buildErrResponse(reqs, podReqGPU), nil
return m.buildErrResponse(reqs, podReqGPU), nil
}

log.Infof("new allocated GPUs info %v", &responses)
Expand Down
18 changes: 10 additions & 8 deletions pkg/gpu/nvidia/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,16 @@ const (
containerLogPathLabelKey = "io.kubernetes.container.logpath"
sandboxIDLabelKey = "io.kubernetes.sandbox.id"

envNVGPU = "NVIDIA_VISIBLE_DEVICES"
EnvResourceIndex = "ALIYUN_COM_GPU_MEM_IDX"
EnvResourceByPod = "ALIYUN_COM_GPU_MEM_POD"
EnvResourceByContainer = "ALIYUN_COM_GPU_MEM_CONTAINER"
EnvResourceByDev = "ALIYUN_COM_GPU_MEM_DEV"
EnvAssignedFlag = "ALIYUN_COM_GPU_MEM_ASSIGNED"
EnvResourceAssumeTime = "ALIYUN_COM_GPU_MEM_ASSUME_TIME"
EnvResourceAssignTime = "ALIYUN_COM_GPU_MEM_ASSIGN_TIME"
envNVGPU = "NVIDIA_VISIBLE_DEVICES"
EnvResourceIndex = "ALIYUN_COM_GPU_MEM_IDX"
EnvResourceByPod = "ALIYUN_COM_GPU_MEM_POD"
EnvResourceByContainer = "ALIYUN_COM_GPU_MEM_CONTAINER"
EnvResourceByDev = "ALIYUN_COM_GPU_MEM_DEV"
EnvAssignedFlag = "ALIYUN_COM_GPU_MEM_ASSIGNED"
EnvResourceAssumeTime = "ALIYUN_COM_GPU_MEM_ASSUME_TIME"
EnvMPSPipeDirectory = "CUDA_MPS_PIPE_DIRECTORY"
EnvMPSActiveThreadPercentage = "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"
EnvResourceAssignTime = "ALIYUN_COM_GPU_MEM_ASSIGN_TIME"

GiBPrefix = MemoryUnit("GiB")
MiBPrefix = MemoryUnit("MiB")
Expand Down
6 changes: 4 additions & 2 deletions pkg/gpu/nvidia/gpumanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,15 @@ import (
// sharedGPUManager carries the top-level configuration for the shared GPU
// device plugin, as parsed from the command-line flags in cmd/nvidia/main.go.
type sharedGPUManager struct {
	enableMPS   bool   // enable NVIDIA MPS (from --mps flag)
	healthCheck bool   // enable device health checking (from --health-check flag)
	mpspipe     string // directory for MPS pipes/UNIX domain sockets (from --mps-pipe flag)
}

func NewSharedGPUManager(enableMPS, healthCheck bool, bp MemoryUnit) *sharedGPUManager {
// NewSharedGPUManager builds a sharedGPUManager from the parsed command-line
// options and records the requested memory unit in the package-level metric.
// bp selects the GPU memory unit (GiB or MiB) used when reporting devices.
func NewSharedGPUManager(enableMPS, healthCheck bool, mpspipe string, bp MemoryUnit) *sharedGPUManager {
	// The memory unit is global to the plugin, not per-manager state.
	metric = bp

	mgr := new(sharedGPUManager)
	mgr.enableMPS = enableMPS
	mgr.healthCheck = healthCheck
	mgr.mpspipe = mpspipe
	return mgr
}

Expand Down Expand Up @@ -61,7 +63,7 @@ L:
devicePlugin.Stop()
}

devicePlugin = NewNvidiaDevicePlugin(ngm.enableMPS, ngm.healthCheck)
devicePlugin = NewNvidiaDevicePlugin(ngm.enableMPS, ngm.healthCheck, ngm.mpspipe)
if err := devicePlugin.Serve(); err != nil {
log.Warningf("Failed to start device plugin due to %v", err)
} else {
Expand Down
9 changes: 5 additions & 4 deletions pkg/gpu/nvidia/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ type NvidiaDevicePlugin struct {
devIndxMap map[uint]string
socket string
mps bool
mpspipe string
healthCheck bool

stop chan struct{}
Expand All @@ -32,7 +33,7 @@ type NvidiaDevicePlugin struct {
}

// NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin
func NewNvidiaDevicePlugin(mps, healthCheck bool) *NvidiaDevicePlugin {
func NewNvidiaDevicePlugin(mps, healthCheck bool, mpspipe string) *NvidiaDevicePlugin {
devs, devNameMap := getDevices()
devList := []string{}

Expand All @@ -54,10 +55,10 @@ func NewNvidiaDevicePlugin(mps, healthCheck bool) *NvidiaDevicePlugin {
devNameMap: devNameMap,
socket: serverSock,
mps: mps,
mpspipe: mpspipe,
healthCheck: healthCheck,

stop: make(chan struct{}),
health: make(chan *pluginapi.Device),
stop: make(chan struct{}),
health: make(chan *pluginapi.Device),
}
}

Expand Down