From 727594c3822418304bd59f1ee81ce91dbdbd8fbd Mon Sep 17 00:00:00 2001 From: Alexandre Ferreira Date: Mon, 8 Jun 2020 14:45:14 -0500 Subject: [PATCH] New version of the nvidia GPU access --- main.go | 55 +++++++++++++++++--------- server.go | 2 +- smarter-device-management-pod-k3s.yaml | 5 +++ 3 files changed, 42 insertions(+), 20 deletions(-) diff --git a/main.go b/main.go index d7892f0..22df266 100644 --- a/main.go +++ b/main.go @@ -5,6 +5,7 @@ package main import ( "flag" "fmt" + "strings" "os" "regexp" "syscall" @@ -19,18 +20,20 @@ import ( var confFileName string const ( - deviceFileType int = 0 - nvidiaSysType int = 1 + deviceFileType uint = 0 + nvidiaSysType uint = 1 ) type DeviceInstance struct { - devicePlugin *SmarterDevicePlugin + devicePluginSmarter *SmarterDevicePlugin + devicePluginNvidia *NvidiaDevicePlugin deviceName string socketName string deviceFile string numDevices uint deviceType uint + deviceId string } type DesiredDevice struct { @@ -113,9 +116,9 @@ func main() { var listDevicesAvailable []DeviceInstance for _, deviceToTest := range desiredDevices { - if deviceToTest.DeviceMatch = "nvidia-gpu" { + if deviceToTest.DeviceMatch == "nvidia-gpu" { glog.V(0).Infof("Checking nvidia devices") - foundDevices,err := findDevicesPattern(ExistingDevices, "gpu.[0-9]*") + foundDevices,err := findDevicesPattern(ExistingDevicesSys, "gpu.[0-9]*") if err != nil { glog.Errorf(err.Error()) os.Exit(1) @@ -125,9 +128,10 @@ func main() { if len(foundDevices) > 0 { for _, deviceToCreate := range foundDevices { var newDevice DeviceInstance - deviceId := TrimPrefix(deviceToCreate,"gpu.") + deviceId := strings.TrimPrefix(deviceToCreate,"gpu.") newDevice.deviceName = "smarter-devices/" + "nvidia-gpu" + deviceId - newDevice.socketName = pluginapi.DevicePluginPath + "smarter-" + d"nvidia-gpu" + deviceId + ".sock" + newDevice.deviceId = deviceId + newDevice.socketName = pluginapi.DevicePluginPath + "smarter-nvidia-gpu" + deviceId + ".sock" newDevice.deviceFile = deviceId newDevice.numDevices = deviceToTest.NumMaxDevices newDevice.deviceType = nvidiaSysType @@ -135,8 +139,7 @@ func main() { glog.V(0).Infof("Creating device %s socket and %s name for %s",newDevice.deviceName,newDevice.deviceFile,deviceToTest.DeviceMatch) } } - } - else { + } else { glog.V(0).Infof("Checking devices %s on /dev",deviceToTest.DeviceMatch) foundDevices,err := findDevicesPattern(ExistingDevices, deviceToTest.DeviceMatch) if err != nil { @@ -177,23 +180,30 @@ L: for { if restart { for _, devicesInUse := range listDevicesAvailable { - if devicesInUse.devicePlugin != nil { - devicesInUse.devicePlugin.Stop() - } + switch devicesInUse.deviceType { + case deviceFileType : + if devicesInUse.devicePluginSmarter != nil { + devicesInUse.devicePluginSmarter.Stop() + } + case nvidiaSysType : + if devicesInUse.devicePluginNvidia != nil { + devicesInUse.devicePluginNvidia.Stop() + } + } } var err error for _, devicesInUse := range listDevicesAvailable { switch devicesInUse.deviceType { case deviceFileType : - devicesInUse.devicePlugin = NewSmarterDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceFile, devicesInUse.deviceName, devicesInUse.socketName) - if err = devicesInUse.devicePlugin.Serve(); err != nil { + devicesInUse.devicePluginSmarter = NewSmarterDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceFile, devicesInUse.deviceName, devicesInUse.socketName) + if err = devicesInUse.devicePluginSmarter.Serve(); err != nil { glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?") break } case nvidiaSysType : - devicesInUse.devicePlugin = NewSmarterDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceFile, devicesInUse.deviceName, devicesInUse.socketName) - if err = devicesInUse.devicePlugin.Serve(); err != nil { + devicesInUse.devicePluginNvidia = NewNvidiaDevicePlugin(devicesInUse.deviceName,"NVIDIA_VISIBLE_DEVICES", devicesInUse.socketName, devicesInUse.deviceId) + if err = devicesInUse.devicePluginNvidia.Serve(); err != nil { glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?") break } @@ -224,9 +234,16 @@ L: default: glog.V(0).Infof("Received signal \"%v\", shutting down.", s) for _, devicesInUse := range listDevicesAvailable { - if devicesInUse.devicePlugin != nil { - devicesInUse.devicePlugin.Stop() - } + switch devicesInUse.deviceType { + case deviceFileType : + if devicesInUse.devicePluginSmarter != nil { + devicesInUse.devicePluginSmarter.Stop() + } + case nvidiaSysType : + if devicesInUse.devicePluginNvidia != nil { + devicesInUse.devicePluginNvidia.Stop() + } + } } break L } diff --git a/server.go b/server.go index 7003b07..b0a75b3 100644 --- a/server.go +++ b/server.go @@ -37,7 +37,7 @@ type SmarterDevicePlugin struct { // NewSmarterDevicePlugin returns an initialized SmarterDevicePlugin func NewSmarterDevicePlugin(nDevices uint, deviceFilename string, resourceIdentification string, serverSock string) *SmarterDevicePlugin { return &SmarterDevicePlugin{ - devs: getDevices(uint(10)), + devs: getDevices(nDevices), socket: serverSock, deviceFile: deviceFilename, resourceName: resourceIdentification, diff --git a/smarter-device-management-pod-k3s.yaml b/smarter-device-management-pod-k3s.yaml index acbd566..b4621de 100644 --- a/smarter-device-management-pod-k3s.yaml +++ b/smarter-device-management-pod-k3s.yaml @@ -33,6 +33,8 @@ spec: mountPath: /var/lib/kubelet/device-plugins - name: dev-dir mountPath: /dev + - name: sys-dir + mountPath: /sys volumes: - name: device-plugin hostPath: @@ -40,4 +42,7 @@ spec: - name: dev-dir hostPath: path: /dev + - name: sys-dir + hostPath: + path: /sys terminationGracePeriodSeconds: 30