From 94783dfc37fe311edbed35b04fd1914665350f01 Mon Sep 17 00:00:00 2001 From: Alexandre Ferreira Date: Thu, 4 Jun 2020 14:11:13 -0500 Subject: [PATCH 1/4] WIP: for adding nvidia-gpu as a device --- main.go | 98 ++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 72 insertions(+), 26 deletions(-) diff --git a/main.go b/main.go index 6f23064..d7892f0 100644 --- a/main.go +++ b/main.go @@ -18,6 +18,11 @@ import ( var confFileName string +const ( + deviceFileType int = 0 + nvidiaSysType int = 1 +) + type DeviceInstance struct { devicePlugin *SmarterDevicePlugin @@ -25,6 +30,7 @@ type DeviceInstance struct { socketName string deviceFile string numDevices uint + deviceType uint } type DesiredDevice struct { @@ -46,8 +52,8 @@ func init() { flag.Parse() } -func readDevDirectory() (files []string, err error) { - f, err := os.Open("/dev") +func readDevDirectory(dirToList string) (files []string, err error) { + f, err := os.Open(dirToList) if err != nil { return nil, err } @@ -93,34 +99,65 @@ func main() { } glog.V(0).Info("Reading existing devices on /dev") - ExistingDevices, err := readDevDirectory() + ExistingDevices, err := readDevDirectory("/dev") if err != nil { glog.Errorf(err.Error()) os.Exit(1) } + ExistingDevicesSys, err := readDevDirectory("/sys/devices") + if err != nil { + glog.Errorf(err.Error()) + os.Exit(1) + } var listDevicesAvailable []DeviceInstance for _, deviceToTest := range desiredDevices { - glog.V(0).Infof("Checking devices %s on /dev",deviceToTest.DeviceMatch) - foundDevices,err := findDevicesPattern(ExistingDevices, deviceToTest.DeviceMatch) - if err != nil { - glog.Errorf(err.Error()) - os.Exit(1) - } + if deviceToTest.DeviceMatch = "nvidia-gpu" { + glog.V(0).Infof("Checking nvidia devices") + foundDevices,err := findDevicesPattern(ExistingDevices, "gpu.[0-9]*") + if err != nil { + glog.Errorf(err.Error()) + os.Exit(1) + } - // If found some create the devices entry - if len(foundDevices) > 0 { - for _, deviceToCreate := range foundDevices { - var newDevice DeviceInstance - newDevice.deviceName = "smarter-devices/" + deviceToCreate - newDevice.socketName = pluginapi.DevicePluginPath + "smarter-" + deviceToCreate + ".sock" - newDevice.deviceFile = "/dev/" + deviceToCreate - newDevice.numDevices = deviceToTest.NumMaxDevices - listDevicesAvailable = append(listDevicesAvailable, newDevice) - glog.V(0).Infof("Creating device %s socket and %s name for %s",newDevice.deviceName,newDevice.deviceFile,deviceToTest.DeviceMatch) - } - } + // If found some create the devices entry + if len(foundDevices) > 0 { + for _, deviceToCreate := range foundDevices { + var newDevice DeviceInstance + deviceId := TrimPrefix(deviceToCreate,"gpu.") + newDevice.deviceName = "smarter-devices/" + "nvidia-gpu" + deviceId + newDevice.socketName = pluginapi.DevicePluginPath + "smarter-" + d"nvidia-gpu" + deviceId + ".sock" + newDevice.deviceFile = deviceId + newDevice.numDevices = deviceToTest.NumMaxDevices + newDevice.deviceType = nvidiaSysType + listDevicesAvailable = append(listDevicesAvailable, newDevice) + glog.V(0).Infof("Creating device %s socket and %s name for %s",newDevice.deviceName,newDevice.deviceFile,deviceToTest.DeviceMatch) + } + } + } + else { + glog.V(0).Infof("Checking devices %s on /dev",deviceToTest.DeviceMatch) + foundDevices,err := findDevicesPattern(ExistingDevices, deviceToTest.DeviceMatch) + if err != nil { + glog.Errorf(err.Error()) + os.Exit(1) + } + + // If found some create the devices entry + if len(foundDevices) > 0 { + for _, deviceToCreate := range foundDevices { + var newDevice DeviceInstance + newDevice.deviceType = deviceFileType + newDevice.deviceName = "smarter-devices/" + deviceToCreate + newDevice.socketName = pluginapi.DevicePluginPath + "smarter-" + deviceToCreate + ".sock" + newDevice.deviceFile = "/dev/" + deviceToCreate + newDevice.numDevices = deviceToTest.NumMaxDevices + listDevicesAvailable = append(listDevicesAvailable, newDevice) + glog.V(0).Infof("Creating device %s socket and %s name for %s",newDevice.deviceName,newDevice.deviceFile,deviceToTest.DeviceMatch) + } + } + } } glog.V(0).Info("Starting FS watcher.") @@ -147,11 +184,20 @@ L: var err error for _, devicesInUse := range listDevicesAvailable { - devicesInUse.devicePlugin = NewSmarterDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceFile, devicesInUse.deviceName, devicesInUse.socketName) - if err = devicesInUse.devicePlugin.Serve(); err != nil { - glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?") - break - } + switch devicesInUse.deviceType { + case deviceFileType : + devicesInUse.devicePlugin = NewSmarterDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceFile, devicesInUse.deviceName, devicesInUse.socketName) + if err = devicesInUse.devicePlugin.Serve(); err != nil { + glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?") + break + } + case nvidiaSysType : + devicesInUse.devicePlugin = NewSmarterDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceFile, devicesInUse.deviceName, devicesInUse.socketName) + if err = devicesInUse.devicePlugin.Serve(); err != nil { + glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?") + break + } + } } if err != nil { continue From 727594c3822418304bd59f1ee81ce91dbdbd8fbd Mon Sep 17 00:00:00 2001 From: Alexandre Ferreira Date: Mon, 8 Jun 2020 14:45:14 -0500 Subject: [PATCH 2/4] New version of the nvidia GPU access --- main.go | 55 +++++++++++++++++--------- server.go | 2 +- smarter-device-management-pod-k3s.yaml | 5 +++ 3 files changed, 42 insertions(+), 20 deletions(-) diff --git a/main.go b/main.go index d7892f0..22df266 100644 --- a/main.go +++ b/main.go @@ -5,6 +5,7 @@ package main import ( "flag" "fmt" + "strings" "os" "regexp" "syscall" @@ -19,18 +20,20 @@ import ( var confFileName string const ( - deviceFileType int = 0 - nvidiaSysType int = 1 + deviceFileType uint = 0 + nvidiaSysType uint = 1 ) type DeviceInstance struct { - devicePlugin *SmarterDevicePlugin + devicePluginSmarter *SmarterDevicePlugin + devicePluginNvidia *NvidiaDevicePlugin deviceName string socketName string deviceFile string numDevices uint deviceType uint + deviceId string } type DesiredDevice struct { @@ -113,9 +116,9 @@ func main() { var listDevicesAvailable []DeviceInstance for _, deviceToTest := range desiredDevices { - if deviceToTest.DeviceMatch = "nvidia-gpu" { + if deviceToTest.DeviceMatch == "nvidia-gpu" { glog.V(0).Infof("Checking nvidia devices") - foundDevices,err := findDevicesPattern(ExistingDevices, "gpu.[0-9]*") + foundDevices,err := findDevicesPattern(ExistingDevicesSys, "gpu.[0-9]*") if err != nil { glog.Errorf(err.Error()) os.Exit(1) @@ -125,9 +128,10 @@ func main() { if len(foundDevices) > 0 { for _, deviceToCreate := range foundDevices { var newDevice DeviceInstance - deviceId := TrimPrefix(deviceToCreate,"gpu.") + deviceId := strings.TrimPrefix(deviceToCreate,"gpu.") newDevice.deviceName = "smarter-devices/" + "nvidia-gpu" + deviceId - newDevice.socketName = pluginapi.DevicePluginPath + "smarter-" + d"nvidia-gpu" + deviceId + ".sock" + newDevice.deviceId = deviceId + newDevice.socketName = pluginapi.DevicePluginPath + "smarter-nvidia-gpu" + deviceId + ".sock" newDevice.deviceFile = deviceId newDevice.numDevices = deviceToTest.NumMaxDevices newDevice.deviceType = nvidiaSysType @@ -135,8 +139,7 @@ func main() { glog.V(0).Infof("Creating device %s socket and %s name for %s",newDevice.deviceName,newDevice.deviceFile,deviceToTest.DeviceMatch) } } - } - else { + } else { glog.V(0).Infof("Checking devices %s on /dev",deviceToTest.DeviceMatch) foundDevices,err := findDevicesPattern(ExistingDevices, deviceToTest.DeviceMatch) if err != nil { @@ -177,23 +180,30 @@ L: for { if restart { for _, devicesInUse := range listDevicesAvailable { - if devicesInUse.devicePlugin != nil { - devicesInUse.devicePlugin.Stop() - } + switch devicesInUse.deviceType { + case deviceFileType : + if devicesInUse.devicePluginSmarter != nil { + devicesInUse.devicePluginSmarter.Stop() + } + case nvidiaSysType : + if devicesInUse.devicePluginNvidia != nil { + devicesInUse.devicePluginNvidia.Stop() + } + } } var err error for _, devicesInUse := range listDevicesAvailable { switch devicesInUse.deviceType { case deviceFileType : - devicesInUse.devicePlugin = NewSmarterDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceFile, devicesInUse.deviceName, devicesInUse.socketName) - if err = devicesInUse.devicePlugin.Serve(); err != nil { + devicesInUse.devicePluginSmarter = NewSmarterDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceFile, devicesInUse.deviceName, devicesInUse.socketName) + if err = devicesInUse.devicePluginSmarter.Serve(); err != nil { glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?") break } case nvidiaSysType : - devicesInUse.devicePlugin = NewSmarterDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceFile, devicesInUse.deviceName, devicesInUse.socketName) - if err = devicesInUse.devicePlugin.Serve(); err != nil { + devicesInUse.devicePluginNvidia = NewNvidiaDevicePlugin(devicesInUse.deviceName,"NVIDIA_VISIBLE_DEVICES", devicesInUse.socketName, devicesInUse.deviceId) + if err = devicesInUse.devicePluginNvidia.Serve(); err != nil { glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?") break } @@ -224,9 +234,16 @@ L: default: glog.V(0).Infof("Received signal \"%v\", shutting down.", s) for _, devicesInUse := range listDevicesAvailable { - if devicesInUse.devicePlugin != nil { - devicesInUse.devicePlugin.Stop() - } + switch devicesInUse.deviceType { + case deviceFileType : + if devicesInUse.devicePluginSmarter != nil { + devicesInUse.devicePluginSmarter.Stop() + } + case nvidiaSysType : + if devicesInUse.devicePluginNvidia != nil { + devicesInUse.devicePluginNvidia.Stop() + } + } } break L } diff --git a/server.go b/server.go index 7003b07..b0a75b3 100644 --- a/server.go +++ b/server.go @@ -37,7 +37,7 @@ type SmarterDevicePlugin struct { // NewSmarterDevicePlugin returns an initialized SmarterDevicePlugin func NewSmarterDevicePlugin(nDevices uint, deviceFilename string, resourceIdentification string, serverSock string) *SmarterDevicePlugin { return &SmarterDevicePlugin{ - devs: getDevices(uint(10)), + devs: getDevices(nDevices), socket: serverSock, deviceFile: deviceFilename, resourceName: resourceIdentification, diff --git a/smarter-device-management-pod-k3s.yaml b/smarter-device-management-pod-k3s.yaml index acbd566..b4621de 100644 --- a/smarter-device-management-pod-k3s.yaml +++ b/smarter-device-management-pod-k3s.yaml @@ -33,6 +33,8 @@ spec: mountPath: /var/lib/kubelet/device-plugins - name: dev-dir mountPath: /dev + - name: sys-dir + mountPath: /sys volumes: - name: device-plugin hostPath: @@ -40,4 +42,7 @@ spec: - name: dev-dir hostPath: path: /dev + - name: sys-dir + hostPath: + path: /sys terminationGracePeriodSeconds: 30 From 06a5d1129c3b542c0c1408ae0c7492749f01dd4b Mon Sep 17 00:00:00 2001 From: Alexandre Ferreira Date: Mon, 8 Jun 2020 14:53:33 -0500 Subject: [PATCH 3/4] Forgot the nvidia portion of the server --- nvidia-server.go | 292 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 292 insertions(+) create mode 100644 nvidia-server.go diff --git a/nvidia-server.go b/nvidia-server.go new file mode 100644 index 0000000..baecb4c --- /dev/null +++ b/nvidia-server.go @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main + +import ( + "flag" + "log" + "net" + "os" + "path" + "time" + + "golang.org/x/net/context" + "google.golang.org/grpc" + pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1" +) + +var passDeviceSpecs = flag.Bool("pass-device-specs", false, "pass the list of DeviceSpecs to the kubelet on Allocate()") + +// NvidiaDevicePlugin implements the Kubernetes device plugin API +type NvidiaDevicePlugin struct { + devs []*pluginapi.Device + resourceName string + allocateEnvvar string + socket string + id string + + server *grpc.Server + stop chan interface{} + health chan *pluginapi.Device + +} + +// NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin +func NewNvidiaDevicePlugin(resourceName string, allocateEnvvar string, socket string, id string) *NvidiaDevicePlugin { + return &NvidiaDevicePlugin{ + resourceName: resourceName, + allocateEnvvar: allocateEnvvar, + socket: socket, + id: id, + } +} + +func (m *NvidiaDevicePlugin) initialize() { + m.server = grpc.NewServer([]grpc.ServerOption{}...) +} + +func (m *NvidiaDevicePlugin) cleanup() { +} + +// Start starts the gRPC server, registers the device plugin with the Kubelet, +// and starts the device healthchecks. +func (m *NvidiaDevicePlugin) Start() error { + m.initialize() + + err := m.Serve() + if err != nil { + log.Printf("Could not start device plugin for '%s': %s", m.resourceName, err) + m.cleanup() + return err + } + log.Printf("Starting to serve '%s' on %s", m.resourceName, m.socket) + + err = m.Register() + if err != nil { + log.Printf("Could not register device plugin: %s", err) + m.Stop() + return err + } + log.Printf("Registered device plugin for '%s' with Kubelet", m.resourceName) + + return nil +} + +// Stop stops the gRPC server. +func (m *NvidiaDevicePlugin) Stop() error { + if m == nil || m.server == nil { + return nil + } + log.Printf("Stopping to serve '%s' on %s", m.resourceName, m.socket) + m.server.Stop() + if err := os.Remove(m.socket); err != nil && !os.IsNotExist(err) { + return err + } + m.cleanup() + return nil +} + +// Serve starts the gRPC server of the device plugin. +func (m *NvidiaDevicePlugin) Serve() error { + sock, err := net.Listen("unix", m.socket) + if err != nil { + return err + } + + pluginapi.RegisterDevicePluginServer(m.server, m) + + go func() { + lastCrashTime := time.Now() + restartCount := 0 + for { + log.Printf("Starting GRPC server for '%s'", m.resourceName) + err := m.server.Serve(sock) + if err == nil { + break + } + + log.Printf("GRPC server for '%s' crashed with error: %v", m.resourceName, err) + + // restart if it has not been too often + // i.e. if server has crashed more than 5 times and it didn't last more than one hour each time + if restartCount > 5 { + // quit + log.Fatal("GRPC server for '%s' has repeatedly crashed recently. Quitting", m.resourceName) + } + timeSinceLastCrash := time.Since(lastCrashTime).Seconds() + lastCrashTime = time.Now() + if timeSinceLastCrash > 3600 { + // it has been one hour since the last crash.. reset the count + // to reflect on the frequency + restartCount = 1 + } else { + restartCount += 1 + } + } + }() + + // Wait for server to start by launching a blocking connexion + conn, err := m.dial(m.socket, 5*time.Second) + if err != nil { + return err + } + conn.Close() + + return nil +} + +// Register registers the device plugin for the given resourceName with Kubelet. +func (m *NvidiaDevicePlugin) Register() error { + conn, err := m.dial(pluginapi.KubeletSocket, 5*time.Second) + if err != nil { + return err + } + defer conn.Close() + + client := pluginapi.NewRegistrationClient(conn) + reqt := &pluginapi.RegisterRequest{ + Version: pluginapi.Version, + Endpoint: path.Base(m.socket), + ResourceName: m.resourceName, + } + + _, err = client.Register(context.Background(), reqt) + if err != nil { + return err + } + return nil +} + +func (m *NvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) { + return &pluginapi.DevicePluginOptions{}, nil +} + +// ListAndWatch lists devices and update that list according to the health status +func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error { + s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs}) + + for { + select { + case <-m.stop: + return nil + case d := <-m.health: + // FIXME: there is no way to recover from the Unhealthy state. + d.Health = pluginapi.Unhealthy + log.Printf("'%s' device marked unhealthy: %s", m.resourceName, d.ID) + s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs}) + } + } +} + +// Allocate which return list of devices. +func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) { + responses := pluginapi.AllocateResponse{} + for _, req := range reqs.ContainerRequests { + //for _, id := range req.DevicesIDs { + // if !m.deviceExists(id) { + // return nil, fmt.Errorf("invalid allocation request for '%s': unknown device: %s", m.resourceName, id) + // } + // + + response := pluginapi.ContainerAllocateResponse{ + Envs: map[string]string{ + m.allocateEnvvar: m.id, + }, + } + if *passDeviceSpecs { + response.Devices = m.apiDeviceSpecs(req.DevicesIDs) + } + + responses.ContainerResponses = append(responses.ContainerResponses, &response) + } + + return &responses, nil +} + +func (m *NvidiaDevicePlugin) PreStartContainer(context.Context, *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) { + return &pluginapi.PreStartContainerResponse{}, nil +} + +// dial establishes the gRPC communication with the registered device plugin. +func (m *NvidiaDevicePlugin) dial(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) { + c, err := grpc.Dial(unixSocketPath, grpc.WithInsecure(), grpc.WithBlock(), + grpc.WithTimeout(timeout), + grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) { + return net.DialTimeout("unix", addr, timeout) + }), + ) + + if err != nil { + return nil, err + } + + return c, nil +} + +//func (m *NvidiaDevicePlugin) deviceExists(id string) bool { +// for _, d := range m.cachedDevices { +// if d.ID == id { +// return true +// } +// } +// return false +//} + +//func (m *NvidiaDevicePlugin) apiDevices() []*pluginapi.Device { +// var pdevs []*pluginapi.Device +// for _, d := range m.cachedDevices { +// pdevs = append(pdevs, &d.Device) +// } +// return pdevs +//} + +func (m *NvidiaDevicePlugin) apiDeviceSpecs(filter []string) []*pluginapi.DeviceSpec { + var specs []*pluginapi.DeviceSpec + + paths := []string{ + "/dev/nvidiactl", + "/dev/nvidia-uvm", + "/dev/nvidia-uvm-tools", + "/dev/nvidia-modeset", + } + + for _, p := range paths { + if _, err := os.Stat(p); err == nil { + spec := &pluginapi.DeviceSpec{ + ContainerPath: p, + HostPath: p, + Permissions: "rw", + } + specs = append(specs, spec) + } + } + +// for _, d := range m.devs { +// for _, id := range filter { +// if d.ID == id { +// spec := &pluginapi.DeviceSpec{ +// ContainerPath: d.Path, +// HostPath: d.Path, +// Permissions: "rw", +// } +// specs = append(specs, spec) +// } +// } +// } + + return specs +} From 633128015b52922f479df9023d0c5ef549f4c5d0 Mon Sep 17 00:00:00 2001 From: Alexandre Ferreira Date: Tue, 9 Jun 2020 16:15:51 -0500 Subject: [PATCH 4/4] Add support for nvidia-gpu --- client-alsa.yaml.template | 64 +++++ client-nvidia.yaml.template | 36 +++ compile.sh | 6 +- conf.yaml | 2 +- main.go | 2 +- nvidia-server.go | 236 ++++++++---------- ...device-management-pod-k3s-test-xavier.yaml | 53 ++++ smarter-device-management-pod-k3s.yaml | 4 +- smarter-device-management-pod-k8s.yaml | 2 +- smarter-device-manager-configmap-xavier.yaml | 26 ++ smarter-device-manager-k3s-no-configmap.yaml | 48 ++++ ...device-manager-k3s-with-configmap-rpi.yaml | 75 ++++++ ...ice-manager-k3s-with-configmap-xavier.yaml | 7 +- smarter-device-manager-k3s.yaml | 9 +- ...device-manager-k8s-with-configmap-rpi.yaml | 7 +- ...ice-manager-k8s-with-configmap-xavier.yaml | 74 ++++++ smarter-device-manager-k8s.yaml | 7 +- 17 files changed, 519 insertions(+), 139 deletions(-) create mode 100644 client-alsa.yaml.template create mode 100644 client-nvidia.yaml.template create mode 100644 smarter-device-management-pod-k3s-test-xavier.yaml create mode 100644 smarter-device-manager-configmap-xavier.yaml create mode 100644 smarter-device-manager-k3s-no-configmap.yaml create mode 100644 smarter-device-manager-k3s-with-configmap-rpi.yaml rename smarter-device-manager-k3s-cs.yaml => smarter-device-manager-k3s-with-configmap-xavier.yaml (92%) rename smarter-device-manager-k8s-cs.yaml => smarter-device-manager-k8s-with-configmap-rpi.yaml (91%) create mode 100644 smarter-device-manager-k8s-with-configmap-xavier.yaml diff --git a/client-alsa.yaml.template b/client-alsa.yaml.template new file mode 100644 index 0000000..cada84a --- /dev/null +++ b/client-alsa.yaml.template @@ -0,0 +1,64 @@ +apiVersion: v1 +kind: Pod +metadata: + name: smarter-device-management-client + namespace: NAMESPACE +spec: + serviceAccountName: default + automountServiceAccountToken: false + dnsPolicy: ClusterFirstWithHostNet + hostname: yocto-test-client + nodeName: NODE_TO_TEST + restartPolicy: Never + containers: + - name: smarter-device-management-client + imagePullPolicy: IfNotPresent + image: alpine + command: ["/bin/ash"] + args: + - "-c" + - | + if [ ! -d /dev/snd ] + then + echo "No sound directory available (/dev/snd)" + exit 1 + fi + apk add alsa-utils + if [ $? -gt 0 ] + then + echo "Could not install alsa-utils" + for i in 1 2 3 4 5 6 7 8 9 10 + do + sleep 20 + done + exit $? + fi + if [ $? -gt 0 ] + then + echo "Could not install alsa-utils" + exit $? + fi + RESULT=$(aplay -L) + if [ $? -gt 0 ] + then + echo "Could not execute aplay" + exit $? + fi + NL=$(echo "${RESULT}" | grep tegrasndt19xmob | wc -l) + if [ ${NL} -ne 2 ] + then + echo "Aplay did not find the correct device check:" + echo "${RESULT}" + exit 11 + fi + exit 0 + resources: + limits: + cpu: 100m + memory: 100Mi + smarter-devices/snd: 1 + requests: + cpu: 100m + memory: 100Mi + smarter-devices/snd: 1 + terminationGracePeriodSeconds: 10 diff --git a/client-nvidia.yaml.template b/client-nvidia.yaml.template new file mode 100644 index 0000000..7949a34 --- /dev/null +++ b/client-nvidia.yaml.template @@ -0,0 +1,36 @@ +apiVersion: v1 +kind: Pod +metadata: + name: smarter-device-management-nvidia-client + namespace: NAMESPACE +spec: + serviceAccountName: default + automountServiceAccountToken: false + dnsPolicy: ClusterFirstWithHostNet + hostname: yocto-test-client + nodeName: NODE_TO_TEST + restartPolicy: Never + containers: + - name: smarter-device-management-nvidia-client + imagePullPolicy: IfNotPresent + image: alpine + command: ["/bin/ash"] + args: + - "-c" + - | + if [ ! -e /dev/nvhost-gpu ] + then + echo "No nvidia GPU available (/dev/nvhost-gpu)" + exit 1 + fi + exit 0 + resources: + limits: + cpu: 100m + memory: 100Mi + smarter-devices/nvidia-gpu0: 0 + requests: + cpu: 100m + memory: 100Mi + smarter-devices/nvidia-gpu0: 0 + terminationGracePeriodSeconds: 10 diff --git a/compile.sh b/compile.sh index 14262a0..8f6e123 100755 --- a/compile.sh +++ b/compile.sh @@ -34,10 +34,10 @@ REPOSITORY_NAME="registry.gitlab.com/arm-research/smarter/smarter-device-manager IMAGE_NAME="smarter-device-manager" DIRECTORY_TO_RUN=. -ARCHS="linux/arm/v7" +ARCHS="linux/arm64" # Variable defaults -FLAG_UPLOADIMAGES=1 +FLAG_UPLOADIMAGES=0 FLAG_USESQUASH=0 FLAG_UPLOADMANIFEST=1 ADDITIONAL_TAG="" @@ -96,6 +96,8 @@ fi if [ $FLAG_UPLOADIMAGES -gt 0 ] then PUSH_OPTION="--push" +else + PUSH_OPTION="--load" fi docker buildx build -t "${REPOSITORY_NAME}${IMAGE_NAME}${ADDITIONAL_IMAGE_NAME}:${BUILD_TAG}" --platform=${ARCHS} ${PUSH_OPTION} . diff --git a/conf.yaml b/conf.yaml index c786588..d7408e3 100644 --- a/conf.yaml +++ b/conf.yaml @@ -23,4 +23,4 @@ - devicematch: ^ttyTHS[0-9]*$ nummaxdevices: 1 - devicematch: ^ttyS[0-9]*$ - nummaxdevices: 1 \ No newline at end of file + nummaxdevices: 1 diff --git a/main.go b/main.go index 22df266..e03ff2f 100644 --- a/main.go +++ b/main.go @@ -202,7 +202,7 @@ L: break } case nvidiaSysType : - devicesInUse.devicePluginNvidia = NewNvidiaDevicePlugin(devicesInUse.deviceName,"NVIDIA_VISIBLE_DEVICES", devicesInUse.socketName, devicesInUse.deviceId) + devicesInUse.devicePluginNvidia = NewNvidiaDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceName,"NVIDIA_VISIBLE_DEVICES", devicesInUse.socketName, devicesInUse.deviceId) if err = devicesInUse.devicePluginNvidia.Serve(); err != nil { glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?") break diff --git a/nvidia-server.go b/nvidia-server.go index baecb4c..4f0aa10 100644 --- a/nvidia-server.go +++ b/nvidia-server.go @@ -1,29 +1,16 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +// Copyright (c) 2019, Arm Ltd package main import ( - "flag" - "log" + "flag" "net" "os" "path" + "strings" "time" + "github.com/golang/glog" "golang.org/x/net/context" "google.golang.org/grpc" pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1" @@ -34,124 +21,99 @@ var passDeviceSpecs = flag.Bool("pass-device-specs", false, "pass the list of De // NvidiaDevicePlugin implements the Kubernetes device plugin API type NvidiaDevicePlugin struct { devs []*pluginapi.Device + socket string resourceName string allocateEnvvar string - socket string id string - server *grpc.Server + stop chan interface{} health chan *pluginapi.Device + server *grpc.Server } // NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin -func NewNvidiaDevicePlugin(resourceName string, allocateEnvvar string, socket string, id string) *NvidiaDevicePlugin { +func NewNvidiaDevicePlugin(nDevices uint, resourceName string, allocateEnvvar string, socket string, id string) *NvidiaDevicePlugin { return &NvidiaDevicePlugin{ + devs: getDevices(nDevices), resourceName: resourceName, allocateEnvvar: allocateEnvvar, socket: socket, id: id, + + stop: make(chan interface{}), + health: make(chan *pluginapi.Device), } } -func (m *NvidiaDevicePlugin) initialize() { - m.server = grpc.NewServer([]grpc.ServerOption{}...) +// dial establishes the gRPC communication with the registered device plugin. +func dialNvidia(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) { + c, err := grpc.Dial(unixSocketPath, grpc.WithInsecure(), grpc.WithBlock(), + grpc.WithTimeout(timeout), + grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) { + return net.DialTimeout("unix", addr, timeout) + }), + ) + + if err != nil { + return nil, err + } + + return c, nil } -func (m *NvidiaDevicePlugin) cleanup() { -} - -// Start starts the gRPC server, registers the device plugin with the Kubelet, -// and starts the device healthchecks. +// Start the gRPC server of the device plugin func (m *NvidiaDevicePlugin) Start() error { - m.initialize() - - err := m.Serve() + glog.V(0).Info("Initializing nvidia device manager") + err := m.cleanup() if err != nil { - log.Printf("Could not start device plugin for '%s': %s", m.resourceName, err) - m.cleanup() return err } - log.Printf("Starting to serve '%s' on %s", m.resourceName, m.socket) - err = m.Register() - if err != nil { - log.Printf("Could not register device plugin: %s", err) - m.Stop() - return err - } - log.Printf("Registered device plugin for '%s' with Kubelet", m.resourceName) - - return nil -} - -// Stop stops the gRPC server. -func (m *NvidiaDevicePlugin) Stop() error { - if m == nil || m.server == nil { - return nil - } - log.Printf("Stopping to serve '%s' on %s", m.resourceName, m.socket) - m.server.Stop() - if err := os.Remove(m.socket); err != nil && !os.IsNotExist(err) { - return err - } - m.cleanup() - return nil -} - -// Serve starts the gRPC server of the device plugin. -func (m *NvidiaDevicePlugin) Serve() error { + glog.V(0).Info("Opening nvidia device manager socket ", m.socket) sock, err := net.Listen("unix", m.socket) if err != nil { return err } + glog.V(0).Info("Socket opened nvidia device manager") + m.server = grpc.NewServer([]grpc.ServerOption{}...) pluginapi.RegisterDevicePluginServer(m.server, m) + glog.V(0).Info("gRPC server registered") - go func() { - lastCrashTime := time.Now() - restartCount := 0 - for { - log.Printf("Starting GRPC server for '%s'", m.resourceName) - err := m.server.Serve(sock) - if err == nil { - break - } - - log.Printf("GRPC server for '%s' crashed with error: %v", m.resourceName, err) - - // restart if it has not been too often - // i.e. if server has crashed more than 5 times and it didn't last more than one hour each time - if restartCount > 5 { - // quit - log.Fatal("GRPC server for '%s' has repeatedly crashed recently. Quitting", m.resourceName) - } - timeSinceLastCrash := time.Since(lastCrashTime).Seconds() - lastCrashTime = time.Now() - if timeSinceLastCrash > 3600 { - // it has been one hour since the last crash.. reset the count - // to reflect on the frequency - restartCount = 1 - } else { - restartCount += 1 - } - } - }() + go m.server.Serve(sock) + glog.V(0).Info("gRPC server running on socket") // Wait for server to start by launching a blocking connexion - conn, err := m.dial(m.socket, 5*time.Second) + conn, err := dialNvidia(m.socket, 60*time.Second) if err != nil { return err } conn.Close() + glog.V(0).Info("gRPC Dial OK") + + go m.healthcheck() return nil } -// Register registers the device plugin for the given resourceName with Kubelet. -func (m *NvidiaDevicePlugin) Register() error { - conn, err := m.dial(pluginapi.KubeletSocket, 5*time.Second) +// Stop the gRPC server +func (m *NvidiaDevicePlugin) Stop() error { + if m.server == nil { + return nil + } + + m.server.Stop() + m.server = nil + close(m.stop) + + return m.cleanup() +} + +// Register the device plugin for the given resourceName with Kubelet. +func (m *NvidiaDevicePlugin) Register(kubeletEndpoint, resourceName string) error { + conn, err := dialNvidia(kubeletEndpoint, 5*time.Second) if err != nil { return err } @@ -161,7 +123,7 @@ func (m *NvidiaDevicePlugin) Register() error { reqt := &pluginapi.RegisterRequest{ Version: pluginapi.Version, Endpoint: path.Base(m.socket), - ResourceName: m.resourceName, + ResourceName: resourceName, } _, err = client.Register(context.Background(), reqt) @@ -171,10 +133,6 @@ func (m *NvidiaDevicePlugin) Register() error { return nil } -func (m *NvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) { - return &pluginapi.DevicePluginOptions{}, nil -} - // ListAndWatch lists devices and update that list according to the health status func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error { s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs}) @@ -186,12 +144,15 @@ func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.Device case d := <-m.health: // FIXME: there is no way to recover from the Unhealthy state. d.Health = pluginapi.Unhealthy - log.Printf("'%s' device marked unhealthy: %s", m.resourceName, d.ID) s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs}) } } } +func (m *NvidiaDevicePlugin) unhealthy(dev *pluginapi.Device) { + m.health <- dev +} + // Allocate which return list of devices. func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) { responses := pluginapi.AllocateResponse{} @@ -221,38 +182,61 @@ func (m *NvidiaDevicePlugin) PreStartContainer(context.Context, *pluginapi.PreSt return &pluginapi.PreStartContainerResponse{}, nil } -// dial establishes the gRPC communication with the registered device plugin. -func (m *NvidiaDevicePlugin) dial(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) { - c, err := grpc.Dial(unixSocketPath, grpc.WithInsecure(), grpc.WithBlock(), - grpc.WithTimeout(timeout), - grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) { - return net.DialTimeout("unix", addr, timeout) - }), - ) - - if err != nil { - return nil, err +func (m *NvidiaDevicePlugin) cleanup() error { + if err := os.Remove(m.socket); err != nil && !os.IsNotExist(err) { + return err } - return c, nil + return nil } -//func (m *NvidiaDevicePlugin) deviceExists(id string) bool { -// for _, d := range m.cachedDevices { -// if d.ID == id { -// return true -// } -// } -// return false -//} +func (m *NvidiaDevicePlugin) healthcheck() { + disableHealthChecks := strings.ToLower(os.Getenv(envDisableHealthChecks)) + if disableHealthChecks == "all" { + disableHealthChecks = allHealthChecks + } -//func (m *NvidiaDevicePlugin) apiDevices() []*pluginapi.Device { -// var pdevs []*pluginapi.Device -// for _, d := range m.cachedDevices { -// pdevs = append(pdevs, &d.Device) -// } -// return pdevs -//} + _, cancel := context.WithCancel(context.Background()) + + var xids chan *pluginapi.Device + if !strings.Contains(disableHealthChecks, "xids") { + xids = make(chan *pluginapi.Device) + } + + for { + select { + case <-m.stop: + cancel() + return + case dev := <-xids: + m.unhealthy(dev) + } + } +} + +// Serve starts the gRPC server and register the device plugin to Kubelet +func (m *NvidiaDevicePlugin) Serve() error { + err := m.Start() + if err != nil { + glog.Errorf("Could not start device plugin: %s", err) + return err + } + glog.V(0).Info("Starting to serve on", m.socket) + + err = m.Register(pluginapi.KubeletSocket, m.resourceName) + if err != nil { + glog.Errorf("Could not register device plugin: %s", err) + m.Stop() + return err + } + glog.V(0).Info("Registered device plugin with Kubelet") + + return nil +} + +func (m *NvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) { + return &pluginapi.DevicePluginOptions{}, nil +} func (m *NvidiaDevicePlugin) apiDeviceSpecs(filter []string) []*pluginapi.DeviceSpec { var specs []*pluginapi.DeviceSpec diff --git a/smarter-device-management-pod-k3s-test-xavier.yaml b/smarter-device-management-pod-k3s-test-xavier.yaml new file mode 100644 index 0000000..1499e9f --- /dev/null +++ b/smarter-device-management-pod-k3s-test-xavier.yaml @@ -0,0 +1,53 @@ +apiVersion: v1 +kind: Pod +metadata: + name: smarter-device-management + namespace: default +spec: + # Mark this pod as a critical add-on; when enabled, the critical add-on + # scheduler reserves resources for critical add-on pods so that they can + # be rescheduled after a failure. + # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ + priorityClassName: "system-node-critical" + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + hostname: smarter-device-management + nodeName: smarter-jetson-xavier-4bcc2584 + containers: + - name: smarter-device-manager + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + resources: + limits: + cpu: 100m + memory: 10Mi + requests: + cpu: 10m + memory: 10Mi + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: dev-dir + mountPath: /dev + - name: sys-dir + mountPath: /sys + - name: config + mountPath: /root/config + volumes: + - name: device-plugin + hostPath: + path: /var/lib/rancher/k3s/agent/kubelet/device-plugins + - name: dev-dir + hostPath: + path: /dev + - name: sys-dir + hostPath: + path: /sys + - name: config + configMap: + name: smarter-device-manager-xavier + terminationGracePeriodSeconds: 30 diff --git a/smarter-device-management-pod-k3s.yaml b/smarter-device-management-pod-k3s.yaml index b4621de..d600afb 100644 --- a/smarter-device-management-pod-k3s.yaml +++ b/smarter-device-management-pod-k3s.yaml @@ -15,7 +15,7 @@ spec: nodeName: containers: - name: smarter-device-manager - image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID imagePullPolicy: IfNotPresent securityContext: allowPrivilegeEscalation: false @@ -45,4 +45,4 @@ spec: - name: sys-dir hostPath: path: /sys - terminationGracePeriodSeconds: 30 + terminationGracePeriodSeconds: 30 diff --git a/smarter-device-management-pod-k8s.yaml b/smarter-device-management-pod-k8s.yaml index 196f982..b499716 100644 --- a/smarter-device-management-pod-k8s.yaml +++ b/smarter-device-management-pod-k8s.yaml @@ -15,7 +15,7 @@ spec: nodeName: containers: - name: smarter-device-manager - image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID imagePullPolicy: IfNotPresent securityContext: allowPrivilegeEscalation: false diff --git a/smarter-device-manager-configmap-xavier.yaml b/smarter-device-manager-configmap-xavier.yaml new file mode 100644 index 0000000..d1c38a9 --- /dev/null +++ b/smarter-device-manager-configmap-xavier.yaml @@ -0,0 +1,26 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: smarter-device-manager-xavier +data: + conf.yaml: | + - devicematch: ^snd$ + nummaxdevices: 20 + - devicematch: ^gpiomem$ + nummaxdevices: 40 + - devicematch: ^gpiochip[0-9]*$ + nummaxdevices: 20 + - devicematch: ^hci[0-9]*$ + nummaxdevices: 1 + - devicematch: ^i2c-[0-9]*$ + nummaxdevices: 1 + - devicematch: ^rtc0$ + nummaxdevices: 20 + - devicematch: ^video[0-9]*$ + nummaxdevices: 20 + - devicematch: ^vchiq$ + nummaxdevices: 20 + - devicematch: ^vcsm.*$ + nummaxdevices: 20 + - devicematch: nvidia-gpu + nummaxdevices: 20 diff --git a/smarter-device-manager-k3s-no-configmap.yaml b/smarter-device-manager-k3s-no-configmap.yaml new file mode 100644 index 0000000..3c73069 --- /dev/null +++ b/smarter-device-manager-k3s-no-configmap.yaml @@ -0,0 +1,48 @@ +apiVersion: v1 +kind: Pod +metadata: + name: smarter-device-management + namespace: default +spec: + # Mark this pod as a critical add-on; when enabled, the critical add-on + # scheduler reserves resources for critical add-on pods so that they can + # be rescheduled after a failure. + # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ + priorityClassName: "system-node-critical" + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + hostname: smarter-device-management + nodeName: smarter-jetson-xavier-4bcc2584 + containers: + - name: smarter-device-manager + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + resources: + limits: + cpu: 100m + memory: 10Mi + requests: + cpu: 10m + memory: 10Mi + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: dev-dir + mountPath: /dev + - name: sys-dir + mountPath: /sys + volumes: + - name: device-plugin + hostPath: + path: /var/lib/rancher/k3s/agent/kubelet/device-plugins + - name: dev-dir + hostPath: + path: /dev + - name: sys-dir + hostPath: + path: /sys + terminationGracePeriodSeconds: 30 diff --git a/smarter-device-manager-k3s-with-configmap-rpi.yaml b/smarter-device-manager-k3s-with-configmap-rpi.yaml new file mode 100644 index 0000000..13da61d --- /dev/null +++ b/smarter-device-manager-k3s-with-configmap-rpi.yaml @@ -0,0 +1,75 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: < Replace with the namespace to use > + labels: + name: < Replace with the namespace to use > +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: smarter-device-manager + namespace: < Replace with the namespace to use > + labels: + name: smarter-device-manager + role: agent +spec: + selector: + matchLabels: + name: smarter-device-manager + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: smarter-device-manager + annotations: + node.kubernetes.io/bootstrap-checkpoint: "true" + spec: + nodeSelector: + smarter-device-manager : enabled + priorityClassName: "system-node-critical" + hostname: smarter-device-management + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + containers: + - name: smarter-device-manager + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + resources: + limits: + cpu: 100m + memory: 15Mi + requests: + cpu: 10m + memory: 15Mi + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: dev-dir + mountPath: /dev + - name: config + mountPath: /root/config + - name: sys-dir + mountPath: /sys + volumes: + - name: device-plugin + hostPath: + path: /var/lib/rancher/k3s/agent/kubelet/device-plugins + - name: dev-dir + hostPath: + path: /dev + - name: sys-dir + hostPath: + path: /sys + - name: config + configMap: + name: smarter-device-manager-rpi + - name: config + hostPath: + path: /var/lib/rancher/k3s/agent/kubelet/device-plugins + terminationGracePeriodSeconds: 30 diff --git a/smarter-device-manager-k3s-cs.yaml b/smarter-device-manager-k3s-with-configmap-xavier.yaml similarity index 92% rename from smarter-device-manager-k3s-cs.yaml rename to smarter-device-manager-k3s-with-configmap-xavier.yaml index e508bf4..afb1aa8 100644 --- a/smarter-device-manager-k3s-cs.yaml +++ b/smarter-device-manager-k3s-with-configmap-xavier.yaml @@ -36,7 +36,7 @@ spec: - name: k8sedgeregcred containers: - name: smarter-device-manager - image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID imagePullPolicy: IfNotPresent securityContext: allowPrivilegeEscalation: false @@ -54,6 +54,8 @@ spec: mountPath: /var/lib/kubelet/device-plugins - name: dev-dir mountPath: /dev + - name: sys-dir + mountPath: /sys - name: config mountPath: /root/config volumes: @@ -63,6 +65,9 @@ spec: - name: dev-dir hostPath: path: /dev + - name: sys-dir + hostPath: + path: /sys - name: config configMap: name: smarter-device-manager-rpi diff --git a/smarter-device-manager-k3s.yaml b/smarter-device-manager-k3s.yaml index 29e836a..70d5d6f 100644 --- a/smarter-device-manager-k3s.yaml +++ b/smarter-device-manager-k3s.yaml @@ -32,11 +32,9 @@ spec: hostname: smarter-device-management hostNetwork: true dnsPolicy: ClusterFirstWithHostNet - imagePullSecrets: - - name: k8sedgeregcred containers: - name: smarter-device-manager - image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID imagePullPolicy: IfNotPresent securityContext: allowPrivilegeEscalation: false @@ -54,6 +52,8 @@ spec: mountPath: /var/lib/kubelet/device-plugins - name: dev-dir mountPath: /dev + - name: sys-dir + mountPath: /sys volumes: - name: device-plugin hostPath: @@ -61,4 +61,7 @@ spec: - name: dev-dir hostPath: path: /dev + - name: sys-dir + hostPath: + path: /sys terminationGracePeriodSeconds: 30 diff --git a/smarter-device-manager-k8s-cs.yaml b/smarter-device-manager-k8s-with-configmap-rpi.yaml similarity index 91% rename from smarter-device-manager-k8s-cs.yaml rename to smarter-device-manager-k8s-with-configmap-rpi.yaml index f841227..37c5dd8 100644 --- a/smarter-device-manager-k8s-cs.yaml +++ b/smarter-device-manager-k8s-with-configmap-rpi.yaml @@ -36,7 +36,7 @@ spec: - name: k8sedgeregcred containers: - name: smarter-device-manager - image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID imagePullPolicy: IfNotPresent securityContext: allowPrivilegeEscalation: false @@ -54,6 +54,8 @@ spec: mountPath: /var/lib/kubelet/device-plugins - name: dev-dir mountPath: /dev + - name: sys-dir + mountPath: /sys - name: config mountPath: /root/config volumes: @@ -63,6 +65,9 @@ spec: - name: dev-dir hostPath: path: /dev + - name: sys-dir + hostPath: + path: /sys - name: config configMap: name: smarter-device-manager-rpi diff --git a/smarter-device-manager-k8s-with-configmap-xavier.yaml b/smarter-device-manager-k8s-with-configmap-xavier.yaml new file mode 100644 index 0000000..ae861b5 --- /dev/null +++ b/smarter-device-manager-k8s-with-configmap-xavier.yaml @@ -0,0 +1,74 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: < Replace with the namespace to use > + labels: + name: < Replace with the namespace to use > +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: smarter-device-manager + namespace: < Replace with the namespace to use > + labels: + name: smarter-device-manager + role: agent +spec: + selector: + matchLabels: + name: smarter-device-manager + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: smarter-device-manager + annotations: + node.kubernetes.io/bootstrap-checkpoint: "true" + spec: + nodeSelector: + smarter-device-manager : enabled + priorityClassName: "system-node-critical" + hostname: smarter-device-management + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + imagePullSecrets: + - name: k8sedgeregcred + containers: + - name: smarter-device-manager + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + resources: + limits: + cpu: 100m + memory: 15Mi + requests: + cpu: 10m + memory: 15Mi + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: dev-dir + mountPath: /dev + - name: sys-dir + mountPath: /sys + - name: config + mountPath: /root/config + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins + - name: dev-dir + hostPath: + path: /dev + - name: sys-dir + hostPath: + path: /sys + - name: config + configMap: + name: smarter-device-manager-xavier + terminationGracePeriodSeconds: 30 diff --git a/smarter-device-manager-k8s.yaml b/smarter-device-manager-k8s.yaml index f38afc7..12c67be 100644 --- a/smarter-device-manager-k8s.yaml +++ b/smarter-device-manager-k8s.yaml @@ -36,7 +36,7 @@ spec: - name: k8sedgeregcred containers: - name: smarter-device-manager - image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID imagePullPolicy: IfNotPresent securityContext: allowPrivilegeEscalation: false @@ -54,6 +54,8 @@ spec: mountPath: /var/lib/kubelet/device-plugins - name: dev-dir mountPath: /dev + - name: sys-dir + mountPath: /sys volumes: - name: device-plugin hostPath: @@ -61,4 +63,7 @@ spec: - name: dev-dir hostPath: path: /dev + - name: sys-dir + hostPath: + path: /sys terminationGracePeriodSeconds: 30