Merge branch 'add-nvidiagpu' into 'master'

Add nvidiagpu

See merge request arm-research/smarter/smarter-device-manager!9
This commit is contained in:
Alexandre Ferreira 2020-06-09 21:30:17 +00:00
commit 2e68d7793d
18 changed files with 786 additions and 46 deletions

64
client-alsa.yaml.template Normal file
View File

@ -0,0 +1,64 @@
# Test client for the smarter-devices/snd resource.
# The container installs alsa-utils, runs "aplay -L" and succeeds only when the
# tegrasndt19xmob card shows up on exactly two lines of the output.
# NAMESPACE and NODE_TO_TEST appear to be placeholders substituted before use — confirm.
apiVersion: v1
kind: Pod
metadata:
  name: smarter-device-management-client
  namespace: NAMESPACE
spec:
  serviceAccountName: default
  automountServiceAccountToken: false
  dnsPolicy: ClusterFirstWithHostNet
  hostname: yocto-test-client
  nodeName: NODE_TO_TEST
  restartPolicy: Never
  containers:
  - name: smarter-device-management-client
    imagePullPolicy: IfNotPresent
    image: alpine
    command: ["/bin/ash"]
    args:
    - "-c"
    - |
      if [ ! -d /dev/snd ]
      then
          echo "No sound directory available (/dev/snd)"
          exit 1
      fi
      apk add alsa-utils
      # Save the status immediately: any later command (echo, sleep) overwrites $?.
      RC=$?
      if [ ${RC} -gt 0 ]
      then
          echo "Could not install alsa-utils"
          # Keep the container around for a while before failing.
          for i in 1 2 3 4 5 6 7 8 9 10
          do
              sleep 20
          done
          exit ${RC}
      fi
      RESULT=$(aplay -L)
      RC=$?
      if [ ${RC} -gt 0 ]
      then
          echo "Could not execute aplay"
          exit ${RC}
      fi
      NL=$(echo "${RESULT}" | grep tegrasndt19xmob | wc -l)
      if [ ${NL} -ne 2 ]
      then
          echo "Aplay did not find the correct device check:"
          echo "${RESULT}"
          exit 11
      fi
      exit 0
    resources:
      limits:
        cpu: 100m
        memory: 100Mi
        smarter-devices/snd: 1
      requests:
        cpu: 100m
        memory: 100Mi
        smarter-devices/snd: 1
  terminationGracePeriodSeconds: 10

View File

@ -0,0 +1,36 @@
# Test client for the nvidia GPU resource: the container exits 0 when /dev/nvhost-gpu
# exists inside it and 1 otherwise.
# NAMESPACE and NODE_TO_TEST appear to be placeholders substituted before use — confirm.
apiVersion: v1
kind: Pod
metadata:
name: smarter-device-management-nvidia-client
namespace: NAMESPACE
spec:
serviceAccountName: default
automountServiceAccountToken: false
dnsPolicy: ClusterFirstWithHostNet
hostname: yocto-test-client
nodeName: NODE_TO_TEST
restartPolicy: Never
containers:
- name: smarter-device-management-nvidia-client
imagePullPolicy: IfNotPresent
image: alpine
command: ["/bin/ash"]
args:
- "-c"
- |
if [ ! -e /dev/nvhost-gpu ]
then
echo "No nvidia GPU available (/dev/nvhost-gpu)"
exit 1
fi
exit 0
resources:
# NOTE(review): the quantity for smarter-devices/nvidia-gpu0 is 0 in both limits and
# requests, i.e. no unit of the resource is requested — confirm whether 1 was intended.
limits:
cpu: 100m
memory: 100Mi
smarter-devices/nvidia-gpu0: 0
requests:
cpu: 100m
memory: 100Mi
smarter-devices/nvidia-gpu0: 0
terminationGracePeriodSeconds: 10

View File

@ -34,10 +34,10 @@ REPOSITORY_NAME="registry.gitlab.com/arm-research/smarter/smarter-device-manager
IMAGE_NAME="smarter-device-manager" IMAGE_NAME="smarter-device-manager"
DIRECTORY_TO_RUN=. DIRECTORY_TO_RUN=.
ARCHS="linux/arm/v7" ARCHS="linux/arm64"
# Variable defaults # Variable defaults
FLAG_UPLOADIMAGES=1 FLAG_UPLOADIMAGES=0
FLAG_USESQUASH=0 FLAG_USESQUASH=0
FLAG_UPLOADMANIFEST=1 FLAG_UPLOADMANIFEST=1
ADDITIONAL_TAG="" ADDITIONAL_TAG=""
@ -96,6 +96,8 @@ fi
if [ $FLAG_UPLOADIMAGES -gt 0 ] if [ $FLAG_UPLOADIMAGES -gt 0 ]
then then
PUSH_OPTION="--push" PUSH_OPTION="--push"
else
PUSH_OPTION="--load"
fi fi
docker buildx build -t "${REPOSITORY_NAME}${IMAGE_NAME}${ADDITIONAL_IMAGE_NAME}:${BUILD_TAG}" --platform=${ARCHS} ${PUSH_OPTION} . docker buildx build -t "${REPOSITORY_NAME}${IMAGE_NAME}${ADDITIONAL_IMAGE_NAME}:${BUILD_TAG}" --platform=${ARCHS} ${PUSH_OPTION} .

View File

@ -23,4 +23,4 @@
- devicematch: ^ttyTHS[0-9]*$ - devicematch: ^ttyTHS[0-9]*$
nummaxdevices: 1 nummaxdevices: 1
- devicematch: ^ttyS[0-9]*$ - devicematch: ^ttyS[0-9]*$
nummaxdevices: 1 nummaxdevices: 1

129
main.go
View File

@ -5,6 +5,7 @@ package main
import ( import (
"flag" "flag"
"fmt" "fmt"
"strings"
"os" "os"
"regexp" "regexp"
"syscall" "syscall"
@ -18,13 +19,21 @@ import (
var confFileName string var confFileName string
const (
deviceFileType uint = 0
nvidiaSysType uint = 1
)
type DeviceInstance struct { type DeviceInstance struct {
devicePlugin *SmarterDevicePlugin devicePluginSmarter *SmarterDevicePlugin
devicePluginNvidia *NvidiaDevicePlugin
deviceName string deviceName string
socketName string socketName string
deviceFile string deviceFile string
numDevices uint numDevices uint
deviceType uint
deviceId string
} }
type DesiredDevice struct { type DesiredDevice struct {
@ -46,8 +55,8 @@ func init() {
flag.Parse() flag.Parse()
} }
func readDevDirectory() (files []string, err error) { func readDevDirectory(dirToList string) (files []string, err error) {
f, err := os.Open("/dev") f, err := os.Open(dirToList)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -93,34 +102,65 @@ func main() {
} }
glog.V(0).Info("Reading existing devices on /dev") glog.V(0).Info("Reading existing devices on /dev")
ExistingDevices, err := readDevDirectory() ExistingDevices, err := readDevDirectory("/dev")
if err != nil { if err != nil {
glog.Errorf(err.Error()) glog.Errorf(err.Error())
os.Exit(1) os.Exit(1)
} }
ExistingDevicesSys, err := readDevDirectory("/sys/devices")
if err != nil {
glog.Errorf(err.Error())
os.Exit(1)
}
var listDevicesAvailable []DeviceInstance var listDevicesAvailable []DeviceInstance
for _, deviceToTest := range desiredDevices { for _, deviceToTest := range desiredDevices {
glog.V(0).Infof("Checking devices %s on /dev",deviceToTest.DeviceMatch) if deviceToTest.DeviceMatch == "nvidia-gpu" {
foundDevices,err := findDevicesPattern(ExistingDevices, deviceToTest.DeviceMatch) glog.V(0).Infof("Checking nvidia devices")
if err != nil { foundDevices,err := findDevicesPattern(ExistingDevicesSys, "gpu.[0-9]*")
glog.Errorf(err.Error()) if err != nil {
os.Exit(1) glog.Errorf(err.Error())
} os.Exit(1)
}
// If found some create the devices entry // If found some create the devices entry
if len(foundDevices) > 0 { if len(foundDevices) > 0 {
for _, deviceToCreate := range foundDevices { for _, deviceToCreate := range foundDevices {
var newDevice DeviceInstance var newDevice DeviceInstance
newDevice.deviceName = "smarter-devices/" + deviceToCreate deviceId := strings.TrimPrefix(deviceToCreate,"gpu.")
newDevice.socketName = pluginapi.DevicePluginPath + "smarter-" + deviceToCreate + ".sock" newDevice.deviceName = "smarter-devices/" + "nvidia-gpu" + deviceId
newDevice.deviceFile = "/dev/" + deviceToCreate newDevice.deviceId = deviceId
newDevice.numDevices = deviceToTest.NumMaxDevices newDevice.socketName = pluginapi.DevicePluginPath + "smarter-nvidia-gpu" + deviceId + ".sock"
listDevicesAvailable = append(listDevicesAvailable, newDevice) newDevice.deviceFile = deviceId
glog.V(0).Infof("Creating device %s socket and %s name for %s",newDevice.deviceName,newDevice.deviceFile,deviceToTest.DeviceMatch) newDevice.numDevices = deviceToTest.NumMaxDevices
} newDevice.deviceType = nvidiaSysType
} listDevicesAvailable = append(listDevicesAvailable, newDevice)
glog.V(0).Infof("Creating device %s socket and %s name for %s",newDevice.deviceName,newDevice.deviceFile,deviceToTest.DeviceMatch)
}
}
} else {
glog.V(0).Infof("Checking devices %s on /dev",deviceToTest.DeviceMatch)
foundDevices,err := findDevicesPattern(ExistingDevices, deviceToTest.DeviceMatch)
if err != nil {
glog.Errorf(err.Error())
os.Exit(1)
}
// If found some create the devices entry
if len(foundDevices) > 0 {
for _, deviceToCreate := range foundDevices {
var newDevice DeviceInstance
newDevice.deviceType = deviceFileType
newDevice.deviceName = "smarter-devices/" + deviceToCreate
newDevice.socketName = pluginapi.DevicePluginPath + "smarter-" + deviceToCreate + ".sock"
newDevice.deviceFile = "/dev/" + deviceToCreate
newDevice.numDevices = deviceToTest.NumMaxDevices
listDevicesAvailable = append(listDevicesAvailable, newDevice)
glog.V(0).Infof("Creating device %s socket and %s name for %s",newDevice.deviceName,newDevice.deviceFile,deviceToTest.DeviceMatch)
}
}
}
} }
glog.V(0).Info("Starting FS watcher.") glog.V(0).Info("Starting FS watcher.")
@ -140,18 +180,34 @@ L:
for { for {
if restart { if restart {
for _, devicesInUse := range listDevicesAvailable { for _, devicesInUse := range listDevicesAvailable {
if devicesInUse.devicePlugin != nil { switch devicesInUse.deviceType {
devicesInUse.devicePlugin.Stop() case deviceFileType :
} if devicesInUse.devicePluginSmarter != nil {
devicesInUse.devicePluginSmarter.Stop()
}
case nvidiaSysType :
if devicesInUse.devicePluginNvidia != nil {
devicesInUse.devicePluginNvidia.Stop()
}
}
} }
var err error var err error
for _, devicesInUse := range listDevicesAvailable { for _, devicesInUse := range listDevicesAvailable {
devicesInUse.devicePlugin = NewSmarterDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceFile, devicesInUse.deviceName, devicesInUse.socketName) switch devicesInUse.deviceType {
if err = devicesInUse.devicePlugin.Serve(); err != nil { case deviceFileType :
glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?") devicesInUse.devicePluginSmarter = NewSmarterDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceFile, devicesInUse.deviceName, devicesInUse.socketName)
break if err = devicesInUse.devicePluginSmarter.Serve(); err != nil {
} glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?")
break
}
case nvidiaSysType :
devicesInUse.devicePluginNvidia = NewNvidiaDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceName,"NVIDIA_VISIBLE_DEVICES", devicesInUse.socketName, devicesInUse.deviceId)
if err = devicesInUse.devicePluginNvidia.Serve(); err != nil {
glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?")
break
}
}
} }
if err != nil { if err != nil {
continue continue
@ -178,9 +234,16 @@ L:
default: default:
glog.V(0).Infof("Received signal \"%v\", shutting down.", s) glog.V(0).Infof("Received signal \"%v\", shutting down.", s)
for _, devicesInUse := range listDevicesAvailable { for _, devicesInUse := range listDevicesAvailable {
if devicesInUse.devicePlugin != nil { switch devicesInUse.deviceType {
devicesInUse.devicePlugin.Stop() case deviceFileType :
} if devicesInUse.devicePluginSmarter != nil {
devicesInUse.devicePluginSmarter.Stop()
}
case nvidiaSysType :
if devicesInUse.devicePluginNvidia != nil {
devicesInUse.devicePluginNvidia.Stop()
}
}
} }
break L break L
} }

276
nvidia-server.go Normal file
View File

@ -0,0 +1,276 @@
// Copyright (c) 2019, Arm Ltd
package main
import (
"flag"
"net"
"os"
"path"
"strings"
"time"
"github.com/golang/glog"
"golang.org/x/net/context"
"google.golang.org/grpc"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
)
// passDeviceSpecs is the -pass-device-specs command-line flag (default false).
// When it is set, Allocate also attaches the DeviceSpecs built by apiDeviceSpecs
// to every container response.
var passDeviceSpecs = flag.Bool("pass-device-specs", false, "pass the list of DeviceSpecs to the kubelet on Allocate()")
// NvidiaDevicePlugin implements the Kubernetes device plugin API
type NvidiaDevicePlugin struct {
// devs is the device list sent to the kubelet by ListAndWatch.
devs []*pluginapi.Device
// socket is the path of the unix socket the plugin's gRPC server listens on.
socket string
// resourceName is the resource name passed to Register by Serve.
resourceName string
// allocateEnvvar is the name of the environment variable set by Allocate.
allocateEnvvar string
// id is the value Allocate assigns to allocateEnvvar.
id string
// stop is closed by Stop; ListAndWatch and healthcheck return when it is closed.
stop chan interface{}
// health carries devices that ListAndWatch should mark Unhealthy.
health chan *pluginapi.Device
// server is the gRPC server created by Start; Stop resets it to nil.
server *grpc.Server
}
// NewNvidiaDevicePlugin builds a NvidiaDevicePlugin that is ready for Serve.
//
// nDevices is handed to getDevices to create the advertised device list,
// resourceName is the name later registered with the kubelet, allocateEnvvar
// and id are the environment variable name/value pair injected by Allocate,
// and socket is the unix socket path the gRPC server will listen on.
// The stop and health channels are created unbuffered.
func NewNvidiaDevicePlugin(nDevices uint, resourceName string, allocateEnvvar string, socket string, id string) *NvidiaDevicePlugin {
	plugin := new(NvidiaDevicePlugin)
	plugin.devs = getDevices(nDevices)
	plugin.resourceName = resourceName
	plugin.allocateEnvvar = allocateEnvvar
	plugin.socket = socket
	plugin.id = id
	plugin.stop = make(chan interface{})
	plugin.health = make(chan *pluginapi.Device)
	return plugin
}
// dialNvidia opens an insecure, blocking gRPC client connection to the unix
// socket at unixSocketPath, giving up after timeout. It returns the connection,
// or nil and the dial error.
func dialNvidia(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) {
	// gRPC hands the target address to this dialer, which connects over a unix socket.
	unixDialer := func(addr string, timeout time.Duration) (net.Conn, error) {
		return net.DialTimeout("unix", addr, timeout)
	}
	opts := []grpc.DialOption{
		grpc.WithInsecure(),
		grpc.WithBlock(),
		grpc.WithTimeout(timeout),
		grpc.WithDialer(unixDialer),
	}

	conn, err := grpc.Dial(unixSocketPath, opts...)
	if err != nil {
		return nil, err
	}
	return conn, nil
}
// Start removes any stale socket file, listens on m.socket, launches the gRPC
// server in a goroutine and waits (up to 60 seconds) until a client connection
// to the socket succeeds. On success it also launches the healthcheck goroutine.
//
// It returns the error from cleanup, net.Listen or the probe dial. When the
// probe dial fails, the already-running server is stopped and m.server is reset
// to nil so a failed Start does not leave a serving goroutine and listener behind.
func (m *NvidiaDevicePlugin) Start() error {
	glog.V(0).Info("Initializing nvidia device manager")
	if err := m.cleanup(); err != nil {
		return err
	}

	glog.V(0).Info("Opening nvidia device manager socket ", m.socket)
	sock, err := net.Listen("unix", m.socket)
	if err != nil {
		return err
	}
	glog.V(0).Info("Socket opened nvidia device manager")

	m.server = grpc.NewServer([]grpc.ServerOption{}...)
	pluginapi.RegisterDevicePluginServer(m.server, m)
	glog.V(0).Info("gRPC server registered")

	// Capture the server locally: Stop resets m.server to nil, and the
	// goroutine must not read that field after it has been cleared.
	srv := m.server
	go func() {
		if serveErr := srv.Serve(sock); serveErr != nil {
			glog.Errorf("gRPC server on %s terminated: %s", m.socket, serveErr)
		}
	}()
	glog.V(0).Info("gRPC server running on socket")

	// Wait for server to start by launching a blocking connection
	conn, err := dialNvidia(m.socket, 60*time.Second)
	if err != nil {
		// The server never became reachable; tear it down instead of leaking it.
		srv.Stop()
		m.server = nil
		return err
	}
	conn.Close()
	glog.V(0).Info("gRPC Dial OK")

	go m.healthcheck()
	return nil
}
// Stop shuts the gRPC server down, closes the stop channel so ListAndWatch and
// healthcheck return, and removes the socket file. It is a no-op returning nil
// when no server is set. The returned error is the one from cleanup.
func (m *NvidiaDevicePlugin) Stop() error {
	srv := m.server
	if srv == nil {
		return nil
	}

	srv.Stop()
	m.server = nil
	close(m.stop)

	return m.cleanup()
}
// Register announces this plugin to the kubelet listening on kubeletEndpoint.
// The request carries the plugin API version, the base name of the plugin's
// socket path and resourceName. The dial to the kubelet times out after five
// seconds. It returns the dial or registration error, or nil on success.
func (m *NvidiaDevicePlugin) Register(kubeletEndpoint, resourceName string) error {
	conn, err := dialNvidia(kubeletEndpoint, 5*time.Second)
	if err != nil {
		return err
	}
	defer conn.Close()

	registration := pluginapi.NewRegistrationClient(conn)
	request := &pluginapi.RegisterRequest{
		Version:      pluginapi.Version,
		Endpoint:     path.Base(m.socket),
		ResourceName: resourceName,
	}

	if _, err := registration.Register(context.Background(), request); err != nil {
		return err
	}
	return nil
}
// ListAndWatch sends the current device list on stream s, then re-sends it each
// time a device arrives on the health channel, after marking that device
// Unhealthy. It returns nil once the stop channel is closed, or the error from
// s.Send if a send fails (the stream can no longer deliver updates at that point).
func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
	if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs}); err != nil {
		return err
	}

	for {
		select {
		case <-m.stop:
			return nil
		case d := <-m.health:
			// FIXME: there is no way to recover from the Unhealthy state.
			d.Health = pluginapi.Unhealthy
			if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs}); err != nil {
				return err
			}
		}
	}
}
// unhealthy reports dev on the health channel; ListAndWatch receives from that
// channel and marks the device Unhealthy. The channel is created unbuffered in
// NewNvidiaDevicePlugin, so this send blocks until a receiver takes the value.
func (m *NvidiaDevicePlugin) unhealthy(dev *pluginapi.Device) {
m.health <- dev
}
// Allocate answers a kubelet allocation request with one response per
// container request. Every response sets the environment variable named by
// m.allocateEnvvar to m.id; when the -pass-device-specs flag is set it also
// carries the device specs returned by apiDeviceSpecs.
//
// The requested device IDs are not validated against the advertised devices.
// The returned error is always nil.
func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
	result := &pluginapi.AllocateResponse{}

	for _, containerReq := range reqs.ContainerRequests {
		containerResp := &pluginapi.ContainerAllocateResponse{
			Envs: map[string]string{m.allocateEnvvar: m.id},
		}
		if *passDeviceSpecs {
			containerResp.Devices = m.apiDeviceSpecs(containerReq.DevicesIDs)
		}
		result.ContainerResponses = append(result.ContainerResponses, containerResp)
	}

	return result, nil
}
// PreStartContainer performs no work: it ignores its arguments and returns an
// empty PreStartContainerResponse with a nil error.
func (m *NvidiaDevicePlugin) PreStartContainer(context.Context, *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) {
return &pluginapi.PreStartContainerResponse{}, nil
}
// cleanup deletes the plugin's socket file. A socket that does not exist is not
// treated as an error; any other removal failure is returned.
func (m *NvidiaDevicePlugin) cleanup() error {
	err := os.Remove(m.socket)
	if err == nil || os.IsNotExist(err) {
		return nil
	}
	return err
}
// healthcheck runs until the stop channel is closed, forwarding any device
// received on the local xids channel to unhealthy.
//
// The environment variable named by envDisableHealthChecks controls the xids
// channel: the value "all" is replaced by allHealthChecks, and when the
// resulting lower-cased string contains "xids" the channel stays nil. Nothing
// in this function sends on xids and it is not handed to any other code, so in
// practice the loop only ends via the stop channel.
func (m *NvidiaDevicePlugin) healthcheck() {
	disabled := strings.ToLower(os.Getenv(envDisableHealthChecks))
	if disabled == "all" {
		disabled = allHealthChecks
	}

	// The derived context itself is unused; cancel is released when the loop exits.
	_, cancel := context.WithCancel(context.Background())
	defer cancel()

	var xids chan *pluginapi.Device
	if !strings.Contains(disabled, "xids") {
		xids = make(chan *pluginapi.Device)
	}

	for {
		select {
		case <-m.stop:
			return
		case dev := <-xids:
			m.unhealthy(dev)
		}
	}
}
// Serve starts the plugin's gRPC server and registers the plugin with the
// kubelet under m.resourceName. If registration fails the server is stopped
// again. It returns the Start or Register error, or nil on success.
func (m *NvidiaDevicePlugin) Serve() error {
	err := m.Start()
	if err != nil {
		glog.Errorf("Could not start device plugin: %s", err)
		return err
	}
	// Info formats like fmt.Print, which adds no space between two string
	// operands, so the separator has to be part of the message.
	glog.V(0).Info("Starting to serve on ", m.socket)

	err = m.Register(pluginapi.KubeletSocket, m.resourceName)
	if err != nil {
		glog.Errorf("Could not register device plugin: %s", err)
		// Report a failed teardown, but return the registration error to the caller.
		if stopErr := m.Stop(); stopErr != nil {
			glog.Errorf("Could not stop device plugin after failed registration: %s", stopErr)
		}
		return err
	}
	glog.V(0).Info("Registered device plugin with Kubelet")

	return nil
}
// GetDevicePluginOptions ignores its arguments and returns a zero-valued
// DevicePluginOptions with a nil error.
func (m *NvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
return &pluginapi.DevicePluginOptions{}, nil
}
// apiDeviceSpecs returns a read-write DeviceSpec for each of a fixed set of
// nvidia device nodes that currently exists on the host; container path and
// host path are identical. The result is nil when none of the nodes exists.
//
// filter is currently unused: no per-device specs are produced for the
// requested device IDs.
func (m *NvidiaDevicePlugin) apiDeviceSpecs(filter []string) []*pluginapi.DeviceSpec {
	controlNodes := [...]string{
		"/dev/nvidiactl",
		"/dev/nvidia-uvm",
		"/dev/nvidia-uvm-tools",
		"/dev/nvidia-modeset",
	}

	var specs []*pluginapi.DeviceSpec
	for _, node := range controlNodes {
		// Skip nodes that cannot be stat'ed.
		if _, err := os.Stat(node); err != nil {
			continue
		}
		specs = append(specs, &pluginapi.DeviceSpec{
			ContainerPath: node,
			HostPath:      node,
			Permissions:   "rw",
		})
	}

	return specs
}

View File

@ -37,7 +37,7 @@ type SmarterDevicePlugin struct {
// NewSmarterDevicePlugin returns an initialized SmarterDevicePlugin // NewSmarterDevicePlugin returns an initialized SmarterDevicePlugin
func NewSmarterDevicePlugin(nDevices uint, deviceFilename string, resourceIdentification string, serverSock string) *SmarterDevicePlugin { func NewSmarterDevicePlugin(nDevices uint, deviceFilename string, resourceIdentification string, serverSock string) *SmarterDevicePlugin {
return &SmarterDevicePlugin{ return &SmarterDevicePlugin{
devs: getDevices(uint(10)), devs: getDevices(nDevices),
socket: serverSock, socket: serverSock,
deviceFile: deviceFilename, deviceFile: deviceFilename,
resourceName: resourceIdentification, resourceName: resourceIdentification,

View File

@ -0,0 +1,53 @@
# Standalone Pod running the smarter-device-manager image with the host's /dev and /sys
# and the k3s kubelet device-plugins directory mounted, plus configuration taken from the
# smarter-device-manager-xavier ConfigMap (mounted at /root/config).
apiVersion: v1
kind: Pod
metadata:
name: smarter-device-management
namespace: default
spec:
# Mark this pod as a critical add-on; when enabled, the critical add-on
# scheduler reserves resources for critical add-on pods so that they can
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: "system-node-critical"
hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet
hostname: smarter-device-management
# NOTE(review): nodeName pins the pod to one specific, hard-coded node; change it before
# applying this manifest to another cluster.
nodeName: smarter-jetson-xavier-4bcc2584
containers:
- name: smarter-device-manager
# IMAGE_ID is presumably a placeholder replaced with a real tag at deploy time — confirm.
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
imagePullPolicy: IfNotPresent
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
resources:
limits:
cpu: 100m
memory: 10Mi
requests:
cpu: 10m
memory: 10Mi
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: dev-dir
mountPath: /dev
- name: sys-dir
mountPath: /sys
- name: config
mountPath: /root/config
volumes:
# The k3s kubelet device-plugins directory on the host backs the device-plugin mount.
- name: device-plugin
hostPath:
path: /var/lib/rancher/k3s/agent/kubelet/device-plugins
- name: dev-dir
hostPath:
path: /dev
- name: sys-dir
hostPath:
path: /sys
- name: config
configMap:
name: smarter-device-manager-xavier
terminationGracePeriodSeconds: 30

View File

@ -15,7 +15,7 @@ spec:
nodeName: <replace with node to run> nodeName: <replace with node to run>
containers: containers:
- name: smarter-device-manager - name: smarter-device-manager
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false
@ -33,6 +33,8 @@ spec:
mountPath: /var/lib/kubelet/device-plugins mountPath: /var/lib/kubelet/device-plugins
- name: dev-dir - name: dev-dir
mountPath: /dev mountPath: /dev
- name: sys-dir
mountPath: /sys
volumes: volumes:
- name: device-plugin - name: device-plugin
hostPath: hostPath:
@ -40,4 +42,7 @@ spec:
- name: dev-dir - name: dev-dir
hostPath: hostPath:
path: /dev path: /dev
terminationGracePeriodSeconds: 30 - name: sys-dir
hostPath:
path: /sys
terminationGracePeriodSeconds: 30

View File

@ -15,7 +15,7 @@ spec:
nodeName: <replace with node to run> nodeName: <replace with node to run>
containers: containers:
- name: smarter-device-manager - name: smarter-device-manager
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false

View File

@ -0,0 +1,26 @@
# Device list for the smarter-device-manager, exposed as the file conf.yaml.
# Each entry pairs a devicematch pattern with a nummaxdevices count.
# The literal entry "nvidia-gpu" is special-cased by main.go: instead of being matched
# against /dev it triggers a search for gpu.N entries under /sys/devices.
apiVersion: v1
kind: ConfigMap
metadata:
name: smarter-device-manager-xavier
data:
conf.yaml: |
- devicematch: ^snd$
nummaxdevices: 20
- devicematch: ^gpiomem$
nummaxdevices: 40
- devicematch: ^gpiochip[0-9]*$
nummaxdevices: 20
- devicematch: ^hci[0-9]*$
nummaxdevices: 1
- devicematch: ^i2c-[0-9]*$
nummaxdevices: 1
- devicematch: ^rtc0$
nummaxdevices: 20
- devicematch: ^video[0-9]*$
nummaxdevices: 20
- devicematch: ^vchiq$
nummaxdevices: 20
- devicematch: ^vcsm.*$
nummaxdevices: 20
- devicematch: nvidia-gpu
nummaxdevices: 20

View File

@ -0,0 +1,48 @@
# Standalone Pod running the smarter-device-manager image with the host's /dev and /sys
# and the k3s kubelet device-plugins directory mounted. No config volume is mounted here.
apiVersion: v1
kind: Pod
metadata:
name: smarter-device-management
namespace: default
spec:
# Mark this pod as a critical add-on; when enabled, the critical add-on
# scheduler reserves resources for critical add-on pods so that they can
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: "system-node-critical"
hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet
hostname: smarter-device-management
# NOTE(review): nodeName pins the pod to one specific, hard-coded node; change it before
# applying this manifest to another cluster.
nodeName: smarter-jetson-xavier-4bcc2584
containers:
- name: smarter-device-manager
# IMAGE_ID is presumably a placeholder replaced with a real tag at deploy time — confirm.
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
imagePullPolicy: IfNotPresent
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
resources:
limits:
cpu: 100m
memory: 10Mi
requests:
cpu: 10m
memory: 10Mi
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: dev-dir
mountPath: /dev
- name: sys-dir
mountPath: /sys
volumes:
# The k3s kubelet device-plugins directory on the host backs the device-plugin mount.
- name: device-plugin
hostPath:
path: /var/lib/rancher/k3s/agent/kubelet/device-plugins
- name: dev-dir
hostPath:
path: /dev
- name: sys-dir
hostPath:
path: /sys
terminationGracePeriodSeconds: 30

View File

@ -0,0 +1,75 @@
# Namespace plus DaemonSet that runs the smarter-device-manager on every node labelled
# "smarter-device-manager: enabled". The host's /dev, /sys and the k3s kubelet
# device-plugins directory are mounted, and the configuration comes from a ConfigMap.
# Replace the "< Replace with the namespace to use >" markers before applying.
apiVersion: v1
kind: Namespace
metadata:
  name: < Replace with the namespace to use >
  labels:
    name: < Replace with the namespace to use >
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: smarter-device-manager
  namespace: < Replace with the namespace to use >
  labels:
    name: smarter-device-manager
    role: agent
spec:
  selector:
    matchLabels:
      name: smarter-device-manager
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: smarter-device-manager
      annotations:
        node.kubernetes.io/bootstrap-checkpoint: "true"
    spec:
      nodeSelector:
        smarter-device-manager: enabled
      priorityClassName: "system-node-critical"
      hostname: smarter-device-management
      hostNetwork: true
      dnsPolicy: ClusterFirstWithHostNet
      containers:
      - name: smarter-device-manager
        # IMAGE_ID is presumably a placeholder replaced with a real tag at deploy time — confirm.
        image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
        imagePullPolicy: IfNotPresent
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: ["ALL"]
        resources:
          limits:
            cpu: 100m
            memory: 15Mi
          requests:
            cpu: 10m
            memory: 15Mi
        volumeMounts:
        - name: device-plugin
          mountPath: /var/lib/kubelet/device-plugins
        - name: dev-dir
          mountPath: /dev
        - name: config
          mountPath: /root/config
        - name: sys-dir
          mountPath: /sys
      volumes:
      # The k3s kubelet device-plugins directory on the host backs the device-plugin mount.
      - name: device-plugin
        hostPath:
          path: /var/lib/rancher/k3s/agent/kubelet/device-plugins
      - name: dev-dir
        hostPath:
          path: /dev
      - name: sys-dir
        hostPath:
          path: /sys
      # Volume names must be unique within a pod: the second "config" volume (a hostPath
      # duplicating the device-plugins directory) was removed so the ConfigMap is the
      # single source for /root/config.
      - name: config
        configMap:
          name: smarter-device-manager-rpi
      terminationGracePeriodSeconds: 30

View File

@ -36,7 +36,7 @@ spec:
- name: k8sedgeregcred - name: k8sedgeregcred
containers: containers:
- name: smarter-device-manager - name: smarter-device-manager
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false
@ -54,6 +54,8 @@ spec:
mountPath: /var/lib/kubelet/device-plugins mountPath: /var/lib/kubelet/device-plugins
- name: dev-dir - name: dev-dir
mountPath: /dev mountPath: /dev
- name: sys-dir
mountPath: /sys
- name: config - name: config
mountPath: /root/config mountPath: /root/config
volumes: volumes:
@ -63,6 +65,9 @@ spec:
- name: dev-dir - name: dev-dir
hostPath: hostPath:
path: /dev path: /dev
- name: sys-dir
hostPath:
path: /sys
- name: config - name: config
configMap: configMap:
name: smarter-device-manager-rpi name: smarter-device-manager-rpi

View File

@ -32,11 +32,9 @@ spec:
hostname: smarter-device-management hostname: smarter-device-management
hostNetwork: true hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet dnsPolicy: ClusterFirstWithHostNet
imagePullSecrets:
- name: k8sedgeregcred
containers: containers:
- name: smarter-device-manager - name: smarter-device-manager
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false
@ -54,6 +52,8 @@ spec:
mountPath: /var/lib/kubelet/device-plugins mountPath: /var/lib/kubelet/device-plugins
- name: dev-dir - name: dev-dir
mountPath: /dev mountPath: /dev
- name: sys-dir
mountPath: /sys
volumes: volumes:
- name: device-plugin - name: device-plugin
hostPath: hostPath:
@ -61,4 +61,7 @@ spec:
- name: dev-dir - name: dev-dir
hostPath: hostPath:
path: /dev path: /dev
- name: sys-dir
hostPath:
path: /sys
terminationGracePeriodSeconds: 30 terminationGracePeriodSeconds: 30

View File

@ -36,7 +36,7 @@ spec:
- name: k8sedgeregcred - name: k8sedgeregcred
containers: containers:
- name: smarter-device-manager - name: smarter-device-manager
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false
@ -54,6 +54,8 @@ spec:
mountPath: /var/lib/kubelet/device-plugins mountPath: /var/lib/kubelet/device-plugins
- name: dev-dir - name: dev-dir
mountPath: /dev mountPath: /dev
- name: sys-dir
mountPath: /sys
- name: config - name: config
mountPath: /root/config mountPath: /root/config
volumes: volumes:
@ -63,6 +65,9 @@ spec:
- name: dev-dir - name: dev-dir
hostPath: hostPath:
path: /dev path: /dev
- name: sys-dir
hostPath:
path: /sys
- name: config - name: config
configMap: configMap:
name: smarter-device-manager-rpi name: smarter-device-manager-rpi

View File

@ -0,0 +1,74 @@
# Namespace plus DaemonSet that runs the smarter-device-manager on every node labelled
# "smarter-device-manager : enabled". The host's /dev, /sys and
# /var/lib/kubelet/device-plugins are mounted, and the configuration comes from the
# smarter-device-manager-xavier ConfigMap.
# Replace the "< Replace with the namespace to use >" markers before applying.
apiVersion: v1
kind: Namespace
metadata:
name: < Replace with the namespace to use >
labels:
name: < Replace with the namespace to use >
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: smarter-device-manager
namespace: < Replace with the namespace to use >
labels:
name: smarter-device-manager
role: agent
spec:
selector:
matchLabels:
name: smarter-device-manager
updateStrategy:
type: RollingUpdate
template:
metadata:
labels:
name: smarter-device-manager
annotations:
node.kubernetes.io/bootstrap-checkpoint: "true"
spec:
nodeSelector:
smarter-device-manager : enabled
priorityClassName: "system-node-critical"
hostname: smarter-device-management
hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet
# The image is pulled with the k8sedgeregcred secret, which must exist in the namespace.
imagePullSecrets:
- name: k8sedgeregcred
containers:
- name: smarter-device-manager
# IMAGE_ID is presumably a placeholder replaced with a real tag at deploy time — confirm.
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
imagePullPolicy: IfNotPresent
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
resources:
limits:
cpu: 100m
memory: 15Mi
requests:
cpu: 10m
memory: 15Mi
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: dev-dir
mountPath: /dev
- name: sys-dir
mountPath: /sys
- name: config
mountPath: /root/config
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
- name: dev-dir
hostPath:
path: /dev
- name: sys-dir
hostPath:
path: /sys
- name: config
configMap:
name: smarter-device-manager-xavier
terminationGracePeriodSeconds: 30

View File

@ -36,7 +36,7 @@ spec:
- name: k8sedgeregcred - name: k8sedgeregcred
containers: containers:
- name: smarter-device-manager - name: smarter-device-manager
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false
@ -54,6 +54,8 @@ spec:
mountPath: /var/lib/kubelet/device-plugins mountPath: /var/lib/kubelet/device-plugins
- name: dev-dir - name: dev-dir
mountPath: /dev mountPath: /dev
- name: sys-dir
mountPath: /sys
volumes: volumes:
- name: device-plugin - name: device-plugin
hostPath: hostPath:
@ -61,4 +63,7 @@ spec:
- name: dev-dir - name: dev-dir
hostPath: hostPath:
path: /dev path: /dev
- name: sys-dir
hostPath:
path: /sys
terminationGracePeriodSeconds: 30 terminationGracePeriodSeconds: 30