Add support for nvidia-gpu

This commit is contained in:
Alexandre Ferreira 2020-06-09 16:15:51 -05:00
parent 06a5d1129c
commit 633128015b
17 changed files with 519 additions and 139 deletions

64
client-alsa.yaml.template Normal file
View File

@ -0,0 +1,64 @@
# Test client pod for the "snd" resource exported by smarter-device-manager.
# NAMESPACE and NODE_TO_TEST are template placeholders to be substituted
# before the manifest is applied.
apiVersion: v1
kind: Pod
metadata:
  name: smarter-device-management-client
  namespace: NAMESPACE
spec:
  serviceAccountName: default
  automountServiceAccountToken: false
  dnsPolicy: ClusterFirstWithHostNet
  hostname: yocto-test-client
  nodeName: NODE_TO_TEST
  # Never restart: the container's exit code is the test verdict.
  restartPolicy: Never
  containers:
  - name: smarter-device-management-client
    imagePullPolicy: IfNotPresent
    image: alpine
    command: ["/bin/ash"]
    args:
    - "-c"
    - |
      # Exit codes: 0 = expected ALSA device found, 1 = /dev/snd missing,
      # 11 = aplay ran but did not list the expected device, otherwise the
      # exit status of the failing command (apk or aplay).
      if [ ! -d /dev/snd ]
      then
          echo "No sound directory available (/dev/snd)"
          exit 1
      fi
      apk add alsa-utils
      # Save the status immediately: any later command (echo, sleep)
      # overwrites $?.
      RC=$?
      if [ ${RC} -gt 0 ]
      then
          echo "Could not install alsa-utils"
          # Delay the exit by 10 x 20 s (presumably to leave time to inspect
          # the pod before it terminates).
          for i in 1 2 3 4 5 6 7 8 9 10
          do
              sleep 20
          done
          exit ${RC}
      fi
      RESULT=$(aplay -L)
      RC=$?
      if [ ${RC} -gt 0 ]
      then
          echo "Could not execute aplay"
          exit ${RC}
      fi
      # The test passes only when exactly two lines of the aplay listing
      # mention the tegrasndt19xmob card.
      NL=$(echo "${RESULT}" | grep tegrasndt19xmob | wc -l)
      if [ ${NL} -ne 2 ]
      then
          echo "Aplay did not find the correct device check:"
          echo "${RESULT}"
          exit 11
      fi
      exit 0
    resources:
      limits:
        cpu: 100m
        memory: 100Mi
        smarter-devices/snd: 1
      requests:
        cpu: 100m
        memory: 100Mi
        smarter-devices/snd: 1
  terminationGracePeriodSeconds: 10

View File

@ -0,0 +1,36 @@
# Test client pod for the "nvidia-gpu0" resource exported by
# smarter-device-manager. NAMESPACE and NODE_TO_TEST are template
# placeholders to be substituted before the manifest is applied.
apiVersion: v1
kind: Pod
metadata:
  name: smarter-device-management-nvidia-client
  namespace: NAMESPACE
spec:
  serviceAccountName: default
  automountServiceAccountToken: false
  dnsPolicy: ClusterFirstWithHostNet
  hostname: yocto-test-client
  nodeName: NODE_TO_TEST
  # Never restart: the container's exit code is the test verdict.
  restartPolicy: Never
  containers:
  - name: smarter-device-management-nvidia-client
    imagePullPolicy: IfNotPresent
    image: alpine
    command: ["/bin/ash"]
    args:
    - "-c"
    - |
      # Exit 0 when the GPU device node is visible in the container, 1 otherwise.
      if [ ! -e /dev/nvhost-gpu ]
      then
          echo "No nvidia GPU available (/dev/nvhost-gpu)"
          exit 1
      fi
      exit 0
    resources:
      # Ask for one unit of the GPU resource. A quantity of 0 requests no
      # device, so the check for /dev/nvhost-gpu above could not succeed.
      # NOTE(review): confirm 1 is the intended count for this test.
      limits:
        cpu: 100m
        memory: 100Mi
        smarter-devices/nvidia-gpu0: 1
      requests:
        cpu: 100m
        memory: 100Mi
        smarter-devices/nvidia-gpu0: 1
  terminationGracePeriodSeconds: 10

View File

@ -34,10 +34,10 @@ REPOSITORY_NAME="registry.gitlab.com/arm-research/smarter/smarter-device-manager
IMAGE_NAME="smarter-device-manager" IMAGE_NAME="smarter-device-manager"
DIRECTORY_TO_RUN=. DIRECTORY_TO_RUN=.
ARCHS="linux/arm/v7" ARCHS="linux/arm64"
# Variable defaults # Variable defaults
FLAG_UPLOADIMAGES=1 FLAG_UPLOADIMAGES=0
FLAG_USESQUASH=0 FLAG_USESQUASH=0
FLAG_UPLOADMANIFEST=1 FLAG_UPLOADMANIFEST=1
ADDITIONAL_TAG="" ADDITIONAL_TAG=""
@ -96,6 +96,8 @@ fi
if [ $FLAG_UPLOADIMAGES -gt 0 ] if [ $FLAG_UPLOADIMAGES -gt 0 ]
then then
PUSH_OPTION="--push" PUSH_OPTION="--push"
else
PUSH_OPTION="--load"
fi fi
docker buildx build -t "${REPOSITORY_NAME}${IMAGE_NAME}${ADDITIONAL_IMAGE_NAME}:${BUILD_TAG}" --platform=${ARCHS} ${PUSH_OPTION} . docker buildx build -t "${REPOSITORY_NAME}${IMAGE_NAME}${ADDITIONAL_IMAGE_NAME}:${BUILD_TAG}" --platform=${ARCHS} ${PUSH_OPTION} .

View File

@ -202,7 +202,7 @@ L:
break break
} }
case nvidiaSysType : case nvidiaSysType :
devicesInUse.devicePluginNvidia = NewNvidiaDevicePlugin(devicesInUse.deviceName,"NVIDIA_VISIBLE_DEVICES", devicesInUse.socketName, devicesInUse.deviceId) devicesInUse.devicePluginNvidia = NewNvidiaDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceName,"NVIDIA_VISIBLE_DEVICES", devicesInUse.socketName, devicesInUse.deviceId)
if err = devicesInUse.devicePluginNvidia.Serve(); err != nil { if err = devicesInUse.devicePluginNvidia.Serve(); err != nil {
glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?") glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?")
break break

View File

@ -1,29 +1,16 @@
/* // Copyright (c) 2019, Arm Ltd
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package main package main
import ( import (
"flag" "flag"
"log"
"net" "net"
"os" "os"
"path" "path"
"strings"
"time" "time"
"github.com/golang/glog"
"golang.org/x/net/context" "golang.org/x/net/context"
"google.golang.org/grpc" "google.golang.org/grpc"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1" pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
@ -34,124 +21,99 @@ var passDeviceSpecs = flag.Bool("pass-device-specs", false, "pass the list of De
// NvidiaDevicePlugin implements the Kubernetes device plugin API // NvidiaDevicePlugin implements the Kubernetes device plugin API
type NvidiaDevicePlugin struct { type NvidiaDevicePlugin struct {
devs []*pluginapi.Device devs []*pluginapi.Device
socket string
resourceName string resourceName string
allocateEnvvar string allocateEnvvar string
socket string
id string id string
server *grpc.Server
stop chan interface{} stop chan interface{}
health chan *pluginapi.Device health chan *pluginapi.Device
server *grpc.Server
} }
// NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin // NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin
func NewNvidiaDevicePlugin(resourceName string, allocateEnvvar string, socket string, id string) *NvidiaDevicePlugin { func NewNvidiaDevicePlugin(nDevices uint, resourceName string, allocateEnvvar string, socket string, id string) *NvidiaDevicePlugin {
return &NvidiaDevicePlugin{ return &NvidiaDevicePlugin{
devs: getDevices(nDevices),
resourceName: resourceName, resourceName: resourceName,
allocateEnvvar: allocateEnvvar, allocateEnvvar: allocateEnvvar,
socket: socket, socket: socket,
id: id, id: id,
stop: make(chan interface{}),
health: make(chan *pluginapi.Device),
} }
} }
func (m *NvidiaDevicePlugin) initialize() { // dial establishes the gRPC communication with the registered device plugin.
m.server = grpc.NewServer([]grpc.ServerOption{}...) func dialNvidia(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) {
c, err := grpc.Dial(unixSocketPath, grpc.WithInsecure(), grpc.WithBlock(),
grpc.WithTimeout(timeout),
grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
return net.DialTimeout("unix", addr, timeout)
}),
)
if err != nil {
return nil, err
}
return c, nil
} }
func (m *NvidiaDevicePlugin) cleanup() { // Start the gRPC server of the device plugin
}
// Start starts the gRPC server, registers the device plugin with the Kubelet,
// and starts the device healthchecks.
func (m *NvidiaDevicePlugin) Start() error { func (m *NvidiaDevicePlugin) Start() error {
m.initialize() glog.V(0).Info("Initializing nvidia device manager")
err := m.cleanup()
err := m.Serve()
if err != nil { if err != nil {
log.Printf("Could not start device plugin for '%s': %s", m.resourceName, err)
m.cleanup()
return err return err
} }
log.Printf("Starting to serve '%s' on %s", m.resourceName, m.socket)
err = m.Register() glog.V(0).Info("Opening nvidia device manager socket ", m.socket)
if err != nil {
log.Printf("Could not register device plugin: %s", err)
m.Stop()
return err
}
log.Printf("Registered device plugin for '%s' with Kubelet", m.resourceName)
return nil
}
// Stop stops the gRPC server.
func (m *NvidiaDevicePlugin) Stop() error {
if m == nil || m.server == nil {
return nil
}
log.Printf("Stopping to serve '%s' on %s", m.resourceName, m.socket)
m.server.Stop()
if err := os.Remove(m.socket); err != nil && !os.IsNotExist(err) {
return err
}
m.cleanup()
return nil
}
// Serve starts the gRPC server of the device plugin.
func (m *NvidiaDevicePlugin) Serve() error {
sock, err := net.Listen("unix", m.socket) sock, err := net.Listen("unix", m.socket)
if err != nil { if err != nil {
return err return err
} }
glog.V(0).Info("Socket opened nvidia device manager")
m.server = grpc.NewServer([]grpc.ServerOption{}...)
pluginapi.RegisterDevicePluginServer(m.server, m) pluginapi.RegisterDevicePluginServer(m.server, m)
glog.V(0).Info("gRPC server registered")
go func() { go m.server.Serve(sock)
lastCrashTime := time.Now() glog.V(0).Info("gRPC server running on socket")
restartCount := 0
for {
log.Printf("Starting GRPC server for '%s'", m.resourceName)
err := m.server.Serve(sock)
if err == nil {
break
}
log.Printf("GRPC server for '%s' crashed with error: %v", m.resourceName, err)
// restart if it has not been too often
// i.e. if server has crashed more than 5 times and it didn't last more than one hour each time
if restartCount > 5 {
// quit
log.Fatal("GRPC server for '%s' has repeatedly crashed recently. Quitting", m.resourceName)
}
timeSinceLastCrash := time.Since(lastCrashTime).Seconds()
lastCrashTime = time.Now()
if timeSinceLastCrash > 3600 {
// it has been one hour since the last crash.. reset the count
// to reflect on the frequency
restartCount = 1
} else {
restartCount += 1
}
}
}()
// Wait for server to start by launching a blocking connexion // Wait for server to start by launching a blocking connexion
conn, err := m.dial(m.socket, 5*time.Second) conn, err := dialNvidia(m.socket, 60*time.Second)
if err != nil { if err != nil {
return err return err
} }
conn.Close() conn.Close()
glog.V(0).Info("gRPC Dial OK")
go m.healthcheck()
return nil return nil
} }
// Register registers the device plugin for the given resourceName with Kubelet. // Stop the gRPC server
func (m *NvidiaDevicePlugin) Register() error { func (m *NvidiaDevicePlugin) Stop() error {
conn, err := m.dial(pluginapi.KubeletSocket, 5*time.Second) if m.server == nil {
return nil
}
m.server.Stop()
m.server = nil
close(m.stop)
return m.cleanup()
}
// Register the device plugin for the given resourceName with Kubelet.
func (m *NvidiaDevicePlugin) Register(kubeletEndpoint, resourceName string) error {
conn, err := dialNvidia(kubeletEndpoint, 5*time.Second)
if err != nil { if err != nil {
return err return err
} }
@ -161,7 +123,7 @@ func (m *NvidiaDevicePlugin) Register() error {
reqt := &pluginapi.RegisterRequest{ reqt := &pluginapi.RegisterRequest{
Version: pluginapi.Version, Version: pluginapi.Version,
Endpoint: path.Base(m.socket), Endpoint: path.Base(m.socket),
ResourceName: m.resourceName, ResourceName: resourceName,
} }
_, err = client.Register(context.Background(), reqt) _, err = client.Register(context.Background(), reqt)
@ -171,10 +133,6 @@ func (m *NvidiaDevicePlugin) Register() error {
return nil return nil
} }
func (m *NvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
return &pluginapi.DevicePluginOptions{}, nil
}
// ListAndWatch lists devices and update that list according to the health status // ListAndWatch lists devices and update that list according to the health status
func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error { func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs}) s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs})
@ -186,12 +144,15 @@ func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.Device
case d := <-m.health: case d := <-m.health:
// FIXME: there is no way to recover from the Unhealthy state. // FIXME: there is no way to recover from the Unhealthy state.
d.Health = pluginapi.Unhealthy d.Health = pluginapi.Unhealthy
log.Printf("'%s' device marked unhealthy: %s", m.resourceName, d.ID)
s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs}) s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs})
} }
} }
} }
// unhealthy reports dev on the plugin's health channel. ListAndWatch receives
// from that channel, marks the device Unhealthy and re-sends the device list.
// The channel is created unbuffered in NewNvidiaDevicePlugin, so this call
// blocks until a receiver takes the value.
func (m *NvidiaDevicePlugin) unhealthy(dev *pluginapi.Device) {
m.health <- dev
}
// Allocate which return list of devices. // Allocate which return list of devices.
func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) { func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
responses := pluginapi.AllocateResponse{} responses := pluginapi.AllocateResponse{}
@ -221,38 +182,61 @@ func (m *NvidiaDevicePlugin) PreStartContainer(context.Context, *pluginapi.PreSt
return &pluginapi.PreStartContainerResponse{}, nil return &pluginapi.PreStartContainerResponse{}, nil
} }
// dial establishes the gRPC communication with the registered device plugin. func (m *NvidiaDevicePlugin) cleanup() error {
func (m *NvidiaDevicePlugin) dial(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) { if err := os.Remove(m.socket); err != nil && !os.IsNotExist(err) {
c, err := grpc.Dial(unixSocketPath, grpc.WithInsecure(), grpc.WithBlock(), return err
grpc.WithTimeout(timeout),
grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
return net.DialTimeout("unix", addr, timeout)
}),
)
if err != nil {
return nil, err
} }
return c, nil return nil
} }
//func (m *NvidiaDevicePlugin) deviceExists(id string) bool { func (m *NvidiaDevicePlugin) healthcheck() {
// for _, d := range m.cachedDevices { disableHealthChecks := strings.ToLower(os.Getenv(envDisableHealthChecks))
// if d.ID == id { if disableHealthChecks == "all" {
// return true disableHealthChecks = allHealthChecks
// } }
// }
// return false
//}
//func (m *NvidiaDevicePlugin) apiDevices() []*pluginapi.Device { _, cancel := context.WithCancel(context.Background())
// var pdevs []*pluginapi.Device
// for _, d := range m.cachedDevices { var xids chan *pluginapi.Device
// pdevs = append(pdevs, &d.Device) if !strings.Contains(disableHealthChecks, "xids") {
// } xids = make(chan *pluginapi.Device)
// return pdevs }
//}
for {
select {
case <-m.stop:
cancel()
return
case dev := <-xids:
m.unhealthy(dev)
}
}
}
// Serve starts the gRPC server and registers the device plugin with the
// Kubelet.
//
// It calls Start and then Register with pluginapi.KubeletSocket and the
// plugin's resource name. When registration fails the server is shut down
// again through Stop before the registration error is returned. The returned
// error is always the one from Start or Register; a failure of Stop is only
// logged.
func (m *NvidiaDevicePlugin) Serve() error {
	if err := m.Start(); err != nil {
		glog.Errorf("Could not start device plugin: %s", err)
		return err
	}
	// Info concatenates string operands without a separator, so the trailing
	// space is needed to keep the socket path readable in the log.
	glog.V(0).Info("Starting to serve on ", m.socket)

	if err := m.Register(pluginapi.KubeletSocket, m.resourceName); err != nil {
		glog.Errorf("Could not register device plugin: %s", err)
		if stopErr := m.Stop(); stopErr != nil {
			glog.Errorf("Could not stop device plugin: %s", stopErr)
		}
		return err
	}
	glog.V(0).Info("Registered device plugin with Kubelet")

	return nil
}
// GetDevicePluginOptions returns an empty DevicePluginOptions value and a nil
// error; both arguments are ignored.
func (m *NvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
	opts := new(pluginapi.DevicePluginOptions)
	return opts, nil
}
func (m *NvidiaDevicePlugin) apiDeviceSpecs(filter []string) []*pluginapi.DeviceSpec { func (m *NvidiaDevicePlugin) apiDeviceSpecs(filter []string) []*pluginapi.DeviceSpec {
var specs []*pluginapi.DeviceSpec var specs []*pluginapi.DeviceSpec

View File

@ -0,0 +1,53 @@
# Stand-alone smarter-device-manager pod pinned to one Xavier node, with its
# device list taken from the smarter-device-manager-xavier ConfigMap.
# IMAGE_ID in the image reference is a placeholder for the image tag.
apiVersion: v1
kind: Pod
metadata:
  namespace: default
  name: smarter-device-management
spec:
  # Run as a critical add-on so the critical add-on scheduler reserves
  # resources for it and it can be rescheduled after a failure. See
  # https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
  priorityClassName: system-node-critical
  nodeName: smarter-jetson-xavier-4bcc2584
  hostname: smarter-device-management
  hostNetwork: true
  dnsPolicy: ClusterFirstWithHostNet
  terminationGracePeriodSeconds: 30
  containers:
    - name: smarter-device-manager
      image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
      imagePullPolicy: IfNotPresent
      securityContext:
        allowPrivilegeEscalation: false
        capabilities:
          drop:
            - ALL
      resources:
        requests:
          cpu: 10m
          memory: 10Mi
        limits:
          cpu: 100m
          memory: 10Mi
      volumeMounts:
        - mountPath: /var/lib/kubelet/device-plugins
          name: device-plugin
        - mountPath: /dev
          name: dev-dir
        - mountPath: /sys
          name: sys-dir
        - mountPath: /root/config
          name: config
  volumes:
    # k3s location of the kubelet device-plugins directory on the host.
    - name: device-plugin
      hostPath:
        path: /var/lib/rancher/k3s/agent/kubelet/device-plugins
    - name: dev-dir
      hostPath:
        path: /dev
    - name: sys-dir
      hostPath:
        path: /sys
    - name: config
      configMap:
        name: smarter-device-manager-xavier

View File

@ -15,7 +15,7 @@ spec:
nodeName: <replace with node to run> nodeName: <replace with node to run>
containers: containers:
- name: smarter-device-manager - name: smarter-device-manager
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false
@ -45,4 +45,4 @@ spec:
- name: sys-dir - name: sys-dir
hostPath: hostPath:
path: /sys path: /sys
terminationGracePeriodSeconds: 30 terminationGracePeriodSeconds: 30

View File

@ -15,7 +15,7 @@ spec:
nodeName: <replace with node to run> nodeName: <replace with node to run>
containers: containers:
- name: smarter-device-manager - name: smarter-device-manager
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false

View File

@ -0,0 +1,26 @@
# Device list for smarter-device-manager on Xavier nodes. The manifests that
# reference this ConfigMap mount it at /root/config, so the key below appears
# there as the file conf.yaml.
# NOTE(review): devicematch looks like a regular expression matched against
# device names and nummaxdevices like a per-device allocation cap; confirm
# against the device manager's configuration loader.
apiVersion: v1
kind: ConfigMap
metadata:
  name: smarter-device-manager-xavier
data:
  conf.yaml: |
    - devicematch: ^snd$
      nummaxdevices: 20
    - devicematch: ^gpiomem$
      nummaxdevices: 40
    - devicematch: ^gpiochip[0-9]*$
      nummaxdevices: 20
    - devicematch: ^hci[0-9]*$
      nummaxdevices: 1
    - devicematch: ^i2c-[0-9]*$
      nummaxdevices: 1
    - devicematch: ^rtc0$
      nummaxdevices: 20
    - devicematch: ^video[0-9]*$
      nummaxdevices: 20
    - devicematch: ^vchiq$
      nummaxdevices: 20
    - devicematch: ^vcsm.*$
      nummaxdevices: 20
    - devicematch: nvidia-gpu
      nummaxdevices: 20

View File

@ -0,0 +1,48 @@
# Stand-alone smarter-device-manager pod pinned to one Xavier node. No
# ConfigMap is mounted in this variant.
# IMAGE_ID in the image reference is a placeholder for the image tag.
apiVersion: v1
kind: Pod
metadata:
  namespace: default
  name: smarter-device-management
spec:
  # Run as a critical add-on so the critical add-on scheduler reserves
  # resources for it and it can be rescheduled after a failure. See
  # https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
  priorityClassName: system-node-critical
  nodeName: smarter-jetson-xavier-4bcc2584
  hostname: smarter-device-management
  hostNetwork: true
  dnsPolicy: ClusterFirstWithHostNet
  terminationGracePeriodSeconds: 30
  containers:
    - name: smarter-device-manager
      image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
      imagePullPolicy: IfNotPresent
      securityContext:
        allowPrivilegeEscalation: false
        capabilities:
          drop:
            - ALL
      resources:
        requests:
          cpu: 10m
          memory: 10Mi
        limits:
          cpu: 100m
          memory: 10Mi
      volumeMounts:
        - mountPath: /var/lib/kubelet/device-plugins
          name: device-plugin
        - mountPath: /dev
          name: dev-dir
        - mountPath: /sys
          name: sys-dir
  volumes:
    # k3s location of the kubelet device-plugins directory on the host.
    - name: device-plugin
      hostPath:
        path: /var/lib/rancher/k3s/agent/kubelet/device-plugins
    - name: dev-dir
      hostPath:
        path: /dev
    - name: sys-dir
      hostPath:
        path: /sys

View File

@ -0,0 +1,75 @@
# Namespace plus DaemonSet that runs smarter-device-manager on every node
# labelled "smarter-device-manager: enabled" (k3s kubelet paths).
# Replace the "< Replace with the namespace to use >" placeholders and the
# IMAGE_ID tag before applying.
apiVersion: v1
kind: Namespace
metadata:
  name: < Replace with the namespace to use >
  labels:
    name: < Replace with the namespace to use >
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: smarter-device-manager
  namespace: < Replace with the namespace to use >
  labels:
    name: smarter-device-manager
    role: agent
spec:
  selector:
    matchLabels:
      name: smarter-device-manager
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: smarter-device-manager
      annotations:
        node.kubernetes.io/bootstrap-checkpoint: "true"
    spec:
      nodeSelector:
        smarter-device-manager: enabled
      priorityClassName: "system-node-critical"
      hostname: smarter-device-management
      hostNetwork: true
      dnsPolicy: ClusterFirstWithHostNet
      containers:
      - name: smarter-device-manager
        image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
        imagePullPolicy: IfNotPresent
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: ["ALL"]
        resources:
          limits:
            cpu: 100m
            memory: 15Mi
          requests:
            cpu: 10m
            memory: 15Mi
        volumeMounts:
          - name: device-plugin
            mountPath: /var/lib/kubelet/device-plugins
          - name: dev-dir
            mountPath: /dev
          - name: config
            mountPath: /root/config
          - name: sys-dir
            mountPath: /sys
      volumes:
        # k3s location of the kubelet device-plugins directory on the host.
        - name: device-plugin
          hostPath:
            path: /var/lib/rancher/k3s/agent/kubelet/device-plugins
        - name: dev-dir
          hostPath:
            path: /dev
        - name: sys-dir
          hostPath:
            path: /sys
        # Volume names must be unique within a pod, so "config" is declared
        # once, as the ConfigMap that backs the /root/config mount. The host
        # device-plugins directory is already provided by "device-plugin".
        - name: config
          configMap:
            name: smarter-device-manager-rpi
      terminationGracePeriodSeconds: 30

View File

@ -36,7 +36,7 @@ spec:
- name: k8sedgeregcred - name: k8sedgeregcred
containers: containers:
- name: smarter-device-manager - name: smarter-device-manager
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false
@ -54,6 +54,8 @@ spec:
mountPath: /var/lib/kubelet/device-plugins mountPath: /var/lib/kubelet/device-plugins
- name: dev-dir - name: dev-dir
mountPath: /dev mountPath: /dev
- name: sys-dir
mountPath: /sys
- name: config - name: config
mountPath: /root/config mountPath: /root/config
volumes: volumes:
@ -63,6 +65,9 @@ spec:
- name: dev-dir - name: dev-dir
hostPath: hostPath:
path: /dev path: /dev
- name: sys-dir
hostPath:
path: /sys
- name: config - name: config
configMap: configMap:
name: smarter-device-manager-rpi name: smarter-device-manager-rpi

View File

@ -32,11 +32,9 @@ spec:
hostname: smarter-device-management hostname: smarter-device-management
hostNetwork: true hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet dnsPolicy: ClusterFirstWithHostNet
imagePullSecrets:
- name: k8sedgeregcred
containers: containers:
- name: smarter-device-manager - name: smarter-device-manager
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false
@ -54,6 +52,8 @@ spec:
mountPath: /var/lib/kubelet/device-plugins mountPath: /var/lib/kubelet/device-plugins
- name: dev-dir - name: dev-dir
mountPath: /dev mountPath: /dev
- name: sys-dir
mountPath: /sys
volumes: volumes:
- name: device-plugin - name: device-plugin
hostPath: hostPath:
@ -61,4 +61,7 @@ spec:
- name: dev-dir - name: dev-dir
hostPath: hostPath:
path: /dev path: /dev
- name: sys-dir
hostPath:
path: /sys
terminationGracePeriodSeconds: 30 terminationGracePeriodSeconds: 30

View File

@ -36,7 +36,7 @@ spec:
- name: k8sedgeregcred - name: k8sedgeregcred
containers: containers:
- name: smarter-device-manager - name: smarter-device-manager
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false
@ -54,6 +54,8 @@ spec:
mountPath: /var/lib/kubelet/device-plugins mountPath: /var/lib/kubelet/device-plugins
- name: dev-dir - name: dev-dir
mountPath: /dev mountPath: /dev
- name: sys-dir
mountPath: /sys
- name: config - name: config
mountPath: /root/config mountPath: /root/config
volumes: volumes:
@ -63,6 +65,9 @@ spec:
- name: dev-dir - name: dev-dir
hostPath: hostPath:
path: /dev path: /dev
- name: sys-dir
hostPath:
path: /sys
- name: config - name: config
configMap: configMap:
name: smarter-device-manager-rpi name: smarter-device-manager-rpi

View File

@ -0,0 +1,74 @@
# Namespace plus DaemonSet that runs smarter-device-manager on every node
# labelled "smarter-device-manager: enabled", configured from the
# smarter-device-manager-xavier ConfigMap.
# Replace the "< Replace with the namespace to use >" placeholders and the
# IMAGE_ID tag before applying.
apiVersion: v1
kind: Namespace
metadata:
  name: < Replace with the namespace to use >
  labels:
    name: < Replace with the namespace to use >
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  namespace: < Replace with the namespace to use >
  name: smarter-device-manager
  labels:
    role: agent
    name: smarter-device-manager
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      name: smarter-device-manager
  template:
    metadata:
      annotations:
        node.kubernetes.io/bootstrap-checkpoint: "true"
      labels:
        name: smarter-device-manager
    spec:
      priorityClassName: system-node-critical
      nodeSelector:
        smarter-device-manager: enabled
      hostname: smarter-device-management
      hostNetwork: true
      dnsPolicy: ClusterFirstWithHostNet
      terminationGracePeriodSeconds: 30
      # Registry credentials used to pull the image.
      imagePullSecrets:
        - name: k8sedgeregcred
      containers:
        - name: smarter-device-manager
          image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
          imagePullPolicy: IfNotPresent
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
          resources:
            requests:
              cpu: 10m
              memory: 15Mi
            limits:
              cpu: 100m
              memory: 15Mi
          volumeMounts:
            - mountPath: /var/lib/kubelet/device-plugins
              name: device-plugin
            - mountPath: /dev
              name: dev-dir
            - mountPath: /sys
              name: sys-dir
            - mountPath: /root/config
              name: config
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
        - name: dev-dir
          hostPath:
            path: /dev
        - name: sys-dir
          hostPath:
            path: /sys
        - name: config
          configMap:
            name: smarter-device-manager-xavier

View File

@ -36,7 +36,7 @@ spec:
- name: k8sedgeregcred - name: k8sedgeregcred
containers: containers:
- name: smarter-device-manager - name: smarter-device-manager
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false
@ -54,6 +54,8 @@ spec:
mountPath: /var/lib/kubelet/device-plugins mountPath: /var/lib/kubelet/device-plugins
- name: dev-dir - name: dev-dir
mountPath: /dev mountPath: /dev
- name: sys-dir
mountPath: /sys
volumes: volumes:
- name: device-plugin - name: device-plugin
hostPath: hostPath:
@ -61,4 +63,7 @@ spec:
- name: dev-dir - name: dev-dir
hostPath: hostPath:
path: /dev path: /dev
- name: sys-dir
hostPath:
path: /sys
terminationGracePeriodSeconds: 30 terminationGracePeriodSeconds: 30