From 633128015b52922f479df9023d0c5ef549f4c5d0 Mon Sep 17 00:00:00 2001 From: Alexandre Ferreira Date: Tue, 9 Jun 2020 16:15:51 -0500 Subject: [PATCH] Add support for nvidia-gpu --- client-alsa.yaml.template | 64 +++++ client-nvidia.yaml.template | 36 +++ compile.sh | 6 +- conf.yaml | 2 +- main.go | 2 +- nvidia-server.go | 236 ++++++++---------- ...device-management-pod-k3s-test-xavier.yaml | 53 ++++ smarter-device-management-pod-k3s.yaml | 4 +- smarter-device-management-pod-k8s.yaml | 2 +- smarter-device-manager-configmap-xavier.yaml | 26 ++ smarter-device-manager-k3s-no-configmap.yaml | 48 ++++ ...device-manager-k3s-with-configmap-rpi.yaml | 75 ++++++ ...ice-manager-k3s-with-configmap-xavier.yaml | 7 +- smarter-device-manager-k3s.yaml | 9 +- ...device-manager-k8s-with-configmap-rpi.yaml | 7 +- ...ice-manager-k8s-with-configmap-xavier.yaml | 74 ++++++ smarter-device-manager-k8s.yaml | 7 +- 17 files changed, 519 insertions(+), 139 deletions(-) create mode 100644 client-alsa.yaml.template create mode 100644 client-nvidia.yaml.template create mode 100644 smarter-device-management-pod-k3s-test-xavier.yaml create mode 100644 smarter-device-manager-configmap-xavier.yaml create mode 100644 smarter-device-manager-k3s-no-configmap.yaml create mode 100644 smarter-device-manager-k3s-with-configmap-rpi.yaml rename smarter-device-manager-k3s-cs.yaml => smarter-device-manager-k3s-with-configmap-xavier.yaml (92%) rename smarter-device-manager-k8s-cs.yaml => smarter-device-manager-k8s-with-configmap-rpi.yaml (91%) create mode 100644 smarter-device-manager-k8s-with-configmap-xavier.yaml diff --git a/client-alsa.yaml.template b/client-alsa.yaml.template new file mode 100644 index 0000000..cada84a --- /dev/null +++ b/client-alsa.yaml.template @@ -0,0 +1,64 @@ +apiVersion: v1 +kind: Pod +metadata: + name: smarter-device-management-client + namespace: NAMESPACE +spec: + serviceAccountName: default + automountServiceAccountToken: false + dnsPolicy: ClusterFirstWithHostNet + hostname: yocto-test-client + nodeName: NODE_TO_TEST + restartPolicy: Never + containers: + - name: smarter-device-management-client + imagePullPolicy: IfNotPresent + image: alpine + command: ["/bin/ash"] + args: + - "-c" + - | + if [ ! -d /dev/snd ] + then + echo "No sound directory available (/dev/snd)" + exit 1 + fi + apk add alsa-utils + if [ $? -gt 0 ] + then + echo "Could not install alsa-utils" + for i in 1 2 3 4 5 6 7 8 9 10 + do + sleep 20 + done + exit $? + fi + if [ $? -gt 0 ] + then + echo "Could not install alsa-utils" + exit $? + fi + RESULT=$(aplay -L) + if [ $? -gt 0 ] + then + echo "Could not execute aplay" + exit $? + fi + NL=$(echo "${RESULT}" | grep tegrasndt19xmob | wc -l) + if [ ${NL} -ne 2 ] + then + echo "Aplay did not find the correct device check:" + echo "${RESULT}" + exit 11 + fi + exit 0 + resources: + limits: + cpu: 100m + memory: 100Mi + smarter-devices/snd: 1 + requests: + cpu: 100m + memory: 100Mi + smarter-devices/snd: 1 + terminationGracePeriodSeconds: 10 diff --git a/client-nvidia.yaml.template b/client-nvidia.yaml.template new file mode 100644 index 0000000..7949a34 --- /dev/null +++ b/client-nvidia.yaml.template @@ -0,0 +1,36 @@ +apiVersion: v1 +kind: Pod +metadata: + name: smarter-device-management-nvidia-client + namespace: NAMESPACE +spec: + serviceAccountName: default + automountServiceAccountToken: false + dnsPolicy: ClusterFirstWithHostNet + hostname: yocto-test-client + nodeName: NODE_TO_TEST + restartPolicy: Never + containers: + - name: smarter-device-management-nvidia-client + imagePullPolicy: IfNotPresent + image: alpine + command: ["/bin/ash"] + args: + - "-c" + - | + if [ ! -e /dev/nvhost-gpu ] + then + echo "No nvidia GPU available (/dev/nvhost-gpu)" + exit 1 + fi + exit 0 + resources: + limits: + cpu: 100m + memory: 100Mi + smarter-devices/nvidia-gpu0: 0 + requests: + cpu: 100m + memory: 100Mi + smarter-devices/nvidia-gpu0: 0 + terminationGracePeriodSeconds: 10 diff --git a/compile.sh b/compile.sh index 14262a0..8f6e123 100755 --- a/compile.sh +++ b/compile.sh @@ -34,10 +34,10 @@ REPOSITORY_NAME="registry.gitlab.com/arm-research/smarter/smarter-device-manager IMAGE_NAME="smarter-device-manager" DIRECTORY_TO_RUN=. -ARCHS="linux/arm/v7" +ARCHS="linux/arm64" # Variable defaults -FLAG_UPLOADIMAGES=1 +FLAG_UPLOADIMAGES=0 FLAG_USESQUASH=0 FLAG_UPLOADMANIFEST=1 ADDITIONAL_TAG="" @@ -96,6 +96,8 @@ fi if [ $FLAG_UPLOADIMAGES -gt 0 ] then PUSH_OPTION="--push" +else + PUSH_OPTION="--load" fi docker buildx build -t "${REPOSITORY_NAME}${IMAGE_NAME}${ADDITIONAL_IMAGE_NAME}:${BUILD_TAG}" --platform=${ARCHS} ${PUSH_OPTION} . diff --git a/conf.yaml b/conf.yaml index c786588..d7408e3 100644 --- a/conf.yaml +++ b/conf.yaml @@ -23,4 +23,4 @@ - devicematch: ^ttyTHS[0-9]*$ nummaxdevices: 1 - devicematch: ^ttyS[0-9]*$ - nummaxdevices: 1 \ No newline at end of file + nummaxdevices: 1 diff --git a/main.go b/main.go index 22df266..e03ff2f 100644 --- a/main.go +++ b/main.go @@ -202,7 +202,7 @@ L: break } case nvidiaSysType : - devicesInUse.devicePluginNvidia = NewNvidiaDevicePlugin(devicesInUse.deviceName,"NVIDIA_VISIBLE_DEVICES", devicesInUse.socketName, devicesInUse.deviceId) + devicesInUse.devicePluginNvidia = NewNvidiaDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceName,"NVIDIA_VISIBLE_DEVICES", devicesInUse.socketName, devicesInUse.deviceId) if err = devicesInUse.devicePluginNvidia.Serve(); err != nil { glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?") break diff --git a/nvidia-server.go b/nvidia-server.go index baecb4c..4f0aa10 100644 --- a/nvidia-server.go +++ b/nvidia-server.go @@ -1,29 +1,16 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +// Copyright (c) 2019, Arm Ltd package main import ( - "flag" - "log" + "flag" "net" "os" "path" + "strings" "time" + "github.com/golang/glog" "golang.org/x/net/context" "google.golang.org/grpc" pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1" @@ -34,124 +21,99 @@ var passDeviceSpecs = flag.Bool("pass-device-specs", false, "pass the list of De // NvidiaDevicePlugin implements the Kubernetes device plugin API type NvidiaDevicePlugin struct { devs []*pluginapi.Device + socket string resourceName string allocateEnvvar string - socket string id string - server *grpc.Server + stop chan interface{} health chan *pluginapi.Device + server *grpc.Server } // NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin -func NewNvidiaDevicePlugin(resourceName string, allocateEnvvar string, socket string, id string) *NvidiaDevicePlugin { +func NewNvidiaDevicePlugin(nDevices uint, resourceName string, allocateEnvvar string, socket string, id string) *NvidiaDevicePlugin { return &NvidiaDevicePlugin{ + devs: getDevices(nDevices), resourceName: resourceName, allocateEnvvar: allocateEnvvar, socket: socket, id: id, + + stop: make(chan interface{}), + health: make(chan *pluginapi.Device), } } -func (m *NvidiaDevicePlugin) initialize() { - m.server = grpc.NewServer([]grpc.ServerOption{}...) +// dial establishes the gRPC communication with the registered device plugin. +func dialNvidia(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) { + c, err := grpc.Dial(unixSocketPath, grpc.WithInsecure(), grpc.WithBlock(), + grpc.WithTimeout(timeout), + grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) { + return net.DialTimeout("unix", addr, timeout) + }), + ) + + if err != nil { + return nil, err + } + + return c, nil } -func (m *NvidiaDevicePlugin) cleanup() { -} - -// Start starts the gRPC server, registers the device plugin with the Kubelet, -// and starts the device healthchecks. +// Start the gRPC server of the device plugin func (m *NvidiaDevicePlugin) Start() error { - m.initialize() - - err := m.Serve() + glog.V(0).Info("Initializing nvidia device manager") + err := m.cleanup() if err != nil { - log.Printf("Could not start device plugin for '%s': %s", m.resourceName, err) - m.cleanup() return err } - log.Printf("Starting to serve '%s' on %s", m.resourceName, m.socket) - err = m.Register() - if err != nil { - log.Printf("Could not register device plugin: %s", err) - m.Stop() - return err - } - log.Printf("Registered device plugin for '%s' with Kubelet", m.resourceName) - - return nil -} - -// Stop stops the gRPC server. -func (m *NvidiaDevicePlugin) Stop() error { - if m == nil || m.server == nil { - return nil - } - log.Printf("Stopping to serve '%s' on %s", m.resourceName, m.socket) - m.server.Stop() - if err := os.Remove(m.socket); err != nil && !os.IsNotExist(err) { - return err - } - m.cleanup() - return nil -} - -// Serve starts the gRPC server of the device plugin. -func (m *NvidiaDevicePlugin) Serve() error { + glog.V(0).Info("Opening nvidia device manager socket ", m.socket) sock, err := net.Listen("unix", m.socket) if err != nil { return err } + glog.V(0).Info("Socket opened nvidia device manager") + m.server = grpc.NewServer([]grpc.ServerOption{}...) pluginapi.RegisterDevicePluginServer(m.server, m) + glog.V(0).Info("gRPC server registered") - go func() { - lastCrashTime := time.Now() - restartCount := 0 - for { - log.Printf("Starting GRPC server for '%s'", m.resourceName) - err := m.server.Serve(sock) - if err == nil { - break - } - - log.Printf("GRPC server for '%s' crashed with error: %v", m.resourceName, err) - - // restart if it has not been too often - // i.e. if server has crashed more than 5 times and it didn't last more than one hour each time - if restartCount > 5 { - // quit - log.Fatal("GRPC server for '%s' has repeatedly crashed recently. Quitting", m.resourceName) - } - timeSinceLastCrash := time.Since(lastCrashTime).Seconds() - lastCrashTime = time.Now() - if timeSinceLastCrash > 3600 { - // it has been one hour since the last crash.. reset the count - // to reflect on the frequency - restartCount = 1 - } else { - restartCount += 1 - } - } - }() + go m.server.Serve(sock) + glog.V(0).Info("gRPC server running on socket") // Wait for server to start by launching a blocking connexion - conn, err := m.dial(m.socket, 5*time.Second) + conn, err := dialNvidia(m.socket, 60*time.Second) if err != nil { return err } conn.Close() + glog.V(0).Info("gRPC Dial OK") + + go m.healthcheck() return nil } -// Register registers the device plugin for the given resourceName with Kubelet. -func (m *NvidiaDevicePlugin) Register() error { - conn, err := m.dial(pluginapi.KubeletSocket, 5*time.Second) +// Stop the gRPC server +func (m *NvidiaDevicePlugin) Stop() error { + if m.server == nil { + return nil + } + + m.server.Stop() + m.server = nil + close(m.stop) + + return m.cleanup() +} + +// Register the device plugin for the given resourceName with Kubelet. +func (m *NvidiaDevicePlugin) Register(kubeletEndpoint, resourceName string) error { + conn, err := dialNvidia(kubeletEndpoint, 5*time.Second) if err != nil { return err } @@ -161,7 +123,7 @@ func (m *NvidiaDevicePlugin) Register() error { reqt := &pluginapi.RegisterRequest{ Version: pluginapi.Version, Endpoint: path.Base(m.socket), - ResourceName: m.resourceName, + ResourceName: resourceName, } _, err = client.Register(context.Background(), reqt) @@ -171,10 +133,6 @@ func (m *NvidiaDevicePlugin) Register() error { return nil } -func (m *NvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) { - return &pluginapi.DevicePluginOptions{}, nil -} - // ListAndWatch lists devices and update that list according to the health status func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error { s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs}) @@ -186,12 +144,15 @@ func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.Device case d := <-m.health: // FIXME: there is no way to recover from the Unhealthy state. d.Health = pluginapi.Unhealthy - log.Printf("'%s' device marked unhealthy: %s", m.resourceName, d.ID) s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs}) } } } +func (m *NvidiaDevicePlugin) unhealthy(dev *pluginapi.Device) { + m.health <- dev +} + // Allocate which return list of devices. func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) { responses := pluginapi.AllocateResponse{} @@ -221,38 +182,61 @@ func (m *NvidiaDevicePlugin) PreStartContainer(context.Context, *pluginapi.PreSt return &pluginapi.PreStartContainerResponse{}, nil } -// dial establishes the gRPC communication with the registered device plugin. -func (m *NvidiaDevicePlugin) dial(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) { - c, err := grpc.Dial(unixSocketPath, grpc.WithInsecure(), grpc.WithBlock(), - grpc.WithTimeout(timeout), - grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) { - return net.DialTimeout("unix", addr, timeout) - }), - ) - - if err != nil { - return nil, err +func (m *NvidiaDevicePlugin) cleanup() error { + if err := os.Remove(m.socket); err != nil && !os.IsNotExist(err) { + return err } - return c, nil + return nil } -//func (m *NvidiaDevicePlugin) deviceExists(id string) bool { -// for _, d := range m.cachedDevices { -// if d.ID == id { -// return true -// } -// } -// return false -//} +func (m *NvidiaDevicePlugin) healthcheck() { + disableHealthChecks := strings.ToLower(os.Getenv(envDisableHealthChecks)) + if disableHealthChecks == "all" { + disableHealthChecks = allHealthChecks + } -//func (m *NvidiaDevicePlugin) apiDevices() []*pluginapi.Device { -// var pdevs []*pluginapi.Device -// for _, d := range m.cachedDevices { -// pdevs = append(pdevs, &d.Device) -// } -// return pdevs -//} + _, cancel := context.WithCancel(context.Background()) + + var xids chan *pluginapi.Device + if !strings.Contains(disableHealthChecks, "xids") { + xids = make(chan *pluginapi.Device) + } + + for { + select { + case <-m.stop: + cancel() + return + case dev := <-xids: + m.unhealthy(dev) + } + } +} + +// Serve starts the gRPC server and register the device plugin to Kubelet +func (m *NvidiaDevicePlugin) Serve() error { + err := m.Start() + if err != nil { + glog.Errorf("Could not start device plugin: %s", err) + return err + } + glog.V(0).Info("Starting to serve on", m.socket) + + err = m.Register(pluginapi.KubeletSocket, m.resourceName) + if err != nil { + glog.Errorf("Could not register device plugin: %s", err) + m.Stop() + return err + } + glog.V(0).Info("Registered device plugin with Kubelet") + + return nil +} + +func (m *NvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) { + return &pluginapi.DevicePluginOptions{}, nil +} func (m *NvidiaDevicePlugin) apiDeviceSpecs(filter []string) []*pluginapi.DeviceSpec { var specs []*pluginapi.DeviceSpec diff --git a/smarter-device-management-pod-k3s-test-xavier.yaml b/smarter-device-management-pod-k3s-test-xavier.yaml new file mode 100644 index 0000000..1499e9f --- /dev/null +++ b/smarter-device-management-pod-k3s-test-xavier.yaml @@ -0,0 +1,53 @@ +apiVersion: v1 +kind: Pod +metadata: + name: smarter-device-management + namespace: default +spec: + # Mark this pod as a critical add-on; when enabled, the critical add-on + # scheduler reserves resources for critical add-on pods so that they can + # be rescheduled after a failure. + # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ + priorityClassName: "system-node-critical" + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + hostname: smarter-device-management + nodeName: smarter-jetson-xavier-4bcc2584 + containers: + - name: smarter-device-manager + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + resources: + limits: + cpu: 100m + memory: 10Mi + requests: + cpu: 10m + memory: 10Mi + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: dev-dir + mountPath: /dev + - name: sys-dir + mountPath: /sys + - name: config + mountPath: /root/config + volumes: + - name: device-plugin + hostPath: + path: /var/lib/rancher/k3s/agent/kubelet/device-plugins + - name: dev-dir + hostPath: + path: /dev + - name: sys-dir + hostPath: + path: /sys + - name: config + configMap: + name: smarter-device-manager-xavier + terminationGracePeriodSeconds: 30 diff --git a/smarter-device-management-pod-k3s.yaml b/smarter-device-management-pod-k3s.yaml index b4621de..d600afb 100644 --- a/smarter-device-management-pod-k3s.yaml +++ b/smarter-device-management-pod-k3s.yaml @@ -15,7 +15,7 @@ spec: nodeName: containers: - name: smarter-device-manager - image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID imagePullPolicy: IfNotPresent securityContext: allowPrivilegeEscalation: false @@ -45,4 +45,4 @@ spec: - name: sys-dir hostPath: path: /sys - terminationGracePeriodSeconds: 30 + terminationGracePeriodSeconds: 30 diff --git a/smarter-device-management-pod-k8s.yaml b/smarter-device-management-pod-k8s.yaml index 196f982..b499716 100644 --- a/smarter-device-management-pod-k8s.yaml +++ b/smarter-device-management-pod-k8s.yaml @@ -15,7 +15,7 @@ spec: nodeName: containers: - name: smarter-device-manager - image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID imagePullPolicy: IfNotPresent securityContext: allowPrivilegeEscalation: false diff --git a/smarter-device-manager-configmap-xavier.yaml b/smarter-device-manager-configmap-xavier.yaml new file mode 100644 index 0000000..d1c38a9 --- /dev/null +++ b/smarter-device-manager-configmap-xavier.yaml @@ -0,0 +1,26 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: smarter-device-manager-xavier +data: + conf.yaml: | + - devicematch: ^snd$ + nummaxdevices: 20 + - devicematch: ^gpiomem$ + nummaxdevices: 40 + - devicematch: ^gpiochip[0-9]*$ + nummaxdevices: 20 + - devicematch: ^hci[0-9]*$ + nummaxdevices: 1 + - devicematch: ^i2c-[0-9]*$ + nummaxdevices: 1 + - devicematch: ^rtc0$ + nummaxdevices: 20 + - devicematch: ^video[0-9]*$ + nummaxdevices: 20 + - devicematch: ^vchiq$ + nummaxdevices: 20 + - devicematch: ^vcsm.*$ + nummaxdevices: 20 + - devicematch: nvidia-gpu + nummaxdevices: 20 diff --git a/smarter-device-manager-k3s-no-configmap.yaml b/smarter-device-manager-k3s-no-configmap.yaml new file mode 100644 index 0000000..3c73069 --- /dev/null +++ b/smarter-device-manager-k3s-no-configmap.yaml @@ -0,0 +1,48 @@ +apiVersion: v1 +kind: Pod +metadata: + name: smarter-device-management + namespace: default +spec: + # Mark this pod as a critical add-on; when enabled, the critical add-on + # scheduler reserves resources for critical add-on pods so that they can + # be rescheduled after a failure. + # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ + priorityClassName: "system-node-critical" + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + hostname: smarter-device-management + nodeName: smarter-jetson-xavier-4bcc2584 + containers: + - name: smarter-device-manager + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + resources: + limits: + cpu: 100m + memory: 10Mi + requests: + cpu: 10m + memory: 10Mi + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: dev-dir + mountPath: /dev + - name: sys-dir + mountPath: /sys + volumes: + - name: device-plugin + hostPath: + path: /var/lib/rancher/k3s/agent/kubelet/device-plugins + - name: dev-dir + hostPath: + path: /dev + - name: sys-dir + hostPath: + path: /sys + terminationGracePeriodSeconds: 30 diff --git a/smarter-device-manager-k3s-with-configmap-rpi.yaml b/smarter-device-manager-k3s-with-configmap-rpi.yaml new file mode 100644 index 0000000..13da61d --- /dev/null +++ b/smarter-device-manager-k3s-with-configmap-rpi.yaml @@ -0,0 +1,75 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: < Replace with the namespace to use > + labels: + name: < Replace with the namespace to use > +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: smarter-device-manager + namespace: < Replace with the namespace to use > + labels: + name: smarter-device-manager + role: agent +spec: + selector: + matchLabels: + name: smarter-device-manager + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: smarter-device-manager + annotations: + node.kubernetes.io/bootstrap-checkpoint: "true" + spec: + nodeSelector: + smarter-device-manager : enabled + priorityClassName: "system-node-critical" + hostname: smarter-device-management + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + containers: + - name: smarter-device-manager + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + resources: + limits: + cpu: 100m + memory: 15Mi + requests: + cpu: 10m + memory: 15Mi + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: dev-dir + mountPath: /dev + - name: config + mountPath: /root/config + - name: sys-dir + mountPath: /sys + volumes: + - name: device-plugin + hostPath: + path: /var/lib/rancher/k3s/agent/kubelet/device-plugins + - name: dev-dir + hostPath: + path: /dev + - name: sys-dir + hostPath: + path: /sys + - name: config + configMap: + name: smarter-device-manager-rpi + - name: config + hostPath: + path: /var/lib/rancher/k3s/agent/kubelet/device-plugins + terminationGracePeriodSeconds: 30 diff --git a/smarter-device-manager-k3s-cs.yaml b/smarter-device-manager-k3s-with-configmap-xavier.yaml similarity index 92% rename from smarter-device-manager-k3s-cs.yaml rename to smarter-device-manager-k3s-with-configmap-xavier.yaml index e508bf4..afb1aa8 100644 --- a/smarter-device-manager-k3s-cs.yaml +++ b/smarter-device-manager-k3s-with-configmap-xavier.yaml @@ -36,7 +36,7 @@ spec: - name: k8sedgeregcred containers: - name: smarter-device-manager - image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID imagePullPolicy: IfNotPresent securityContext: allowPrivilegeEscalation: false @@ -54,6 +54,8 @@ spec: mountPath: /var/lib/kubelet/device-plugins - name: dev-dir mountPath: /dev + - name: sys-dir + mountPath: /sys - name: config mountPath: /root/config volumes: @@ -63,6 +65,9 @@ spec: - name: dev-dir hostPath: path: /dev + - name: sys-dir + hostPath: + path: /sys - name: config configMap: name: smarter-device-manager-rpi diff --git a/smarter-device-manager-k3s.yaml b/smarter-device-manager-k3s.yaml index 29e836a..70d5d6f 100644 --- a/smarter-device-manager-k3s.yaml +++ b/smarter-device-manager-k3s.yaml @@ -32,11 +32,9 @@ spec: hostname: smarter-device-management hostNetwork: true dnsPolicy: ClusterFirstWithHostNet - imagePullSecrets: - - name: k8sedgeregcred containers: - name: smarter-device-manager - image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID imagePullPolicy: IfNotPresent securityContext: allowPrivilegeEscalation: false @@ -54,6 +52,8 @@ spec: mountPath: /var/lib/kubelet/device-plugins - name: dev-dir mountPath: /dev + - name: sys-dir + mountPath: /sys volumes: - name: device-plugin hostPath: @@ -61,4 +61,7 @@ spec: - name: dev-dir hostPath: path: /dev + - name: sys-dir + hostPath: + path: /sys terminationGracePeriodSeconds: 30 diff --git a/smarter-device-manager-k8s-cs.yaml b/smarter-device-manager-k8s-with-configmap-rpi.yaml similarity index 91% rename from smarter-device-manager-k8s-cs.yaml rename to smarter-device-manager-k8s-with-configmap-rpi.yaml index f841227..37c5dd8 100644 --- a/smarter-device-manager-k8s-cs.yaml +++ b/smarter-device-manager-k8s-with-configmap-rpi.yaml @@ -36,7 +36,7 @@ spec: - name: k8sedgeregcred containers: - name: smarter-device-manager - image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID imagePullPolicy: IfNotPresent securityContext: allowPrivilegeEscalation: false @@ -54,6 +54,8 @@ spec: mountPath: /var/lib/kubelet/device-plugins - name: dev-dir mountPath: /dev + - name: sys-dir + mountPath: /sys - name: config mountPath: /root/config volumes: @@ -63,6 +65,9 @@ spec: - name: dev-dir hostPath: path: /dev + - name: sys-dir + hostPath: + path: /sys - name: config configMap: name: smarter-device-manager-rpi diff --git a/smarter-device-manager-k8s-with-configmap-xavier.yaml b/smarter-device-manager-k8s-with-configmap-xavier.yaml new file mode 100644 index 0000000..ae861b5 --- /dev/null +++ b/smarter-device-manager-k8s-with-configmap-xavier.yaml @@ -0,0 +1,74 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: < Replace with the namespace to use > + labels: + name: < Replace with the namespace to use > +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: smarter-device-manager + namespace: < Replace with the namespace to use > + labels: + name: smarter-device-manager + role: agent +spec: + selector: + matchLabels: + name: smarter-device-manager + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: smarter-device-manager + annotations: + node.kubernetes.io/bootstrap-checkpoint: "true" + spec: + nodeSelector: + smarter-device-manager : enabled + priorityClassName: "system-node-critical" + hostname: smarter-device-management + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + imagePullSecrets: + - name: k8sedgeregcred + containers: + - name: smarter-device-manager + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + resources: + limits: + cpu: 100m + memory: 15Mi + requests: + cpu: 10m + memory: 15Mi + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: dev-dir + mountPath: /dev + - name: sys-dir + mountPath: /sys + - name: config + mountPath: /root/config + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins + - name: dev-dir + hostPath: + path: /dev + - name: sys-dir + hostPath: + path: /sys + - name: config + configMap: + name: smarter-device-manager-xavier + terminationGracePeriodSeconds: 30 diff --git a/smarter-device-manager-k8s.yaml b/smarter-device-manager-k8s.yaml index f38afc7..12c67be 100644 --- a/smarter-device-manager-k8s.yaml +++ b/smarter-device-manager-k8s.yaml @@ -36,7 +36,7 @@ spec: - name: k8sedgeregcred containers: - name: smarter-device-manager - image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID imagePullPolicy: IfNotPresent securityContext: allowPrivilegeEscalation: false @@ -54,6 +54,8 @@ spec: mountPath: /var/lib/kubelet/device-plugins - name: dev-dir mountPath: /dev + - name: sys-dir + mountPath: /sys volumes: - name: device-plugin hostPath: @@ -61,4 +63,7 @@ spec: - name: dev-dir hostPath: path: /dev + - name: sys-dir + hostPath: + path: /sys terminationGracePeriodSeconds: 30