diff --git a/client-alsa.yaml.template b/client-alsa.yaml.template new file mode 100644 index 0000000..cada84a --- /dev/null +++ b/client-alsa.yaml.template @@ -0,0 +1,64 @@ +apiVersion: v1 +kind: Pod +metadata: + name: smarter-device-management-client + namespace: NAMESPACE +spec: + serviceAccountName: default + automountServiceAccountToken: false + dnsPolicy: ClusterFirstWithHostNet + hostname: yocto-test-client + nodeName: NODE_TO_TEST + restartPolicy: Never + containers: + - name: smarter-device-management-client + imagePullPolicy: IfNotPresent + image: alpine + command: ["/bin/ash"] + args: + - "-c" + - | + if [ ! -d /dev/snd ] + then + echo "No sound directory available (/dev/snd)" + exit 1 + fi + apk add alsa-utils + if [ $? -gt 0 ] + then + echo "Could not install alsa-utils" + for i in 1 2 3 4 5 6 7 8 9 10 + do + sleep 20 + done + exit $? + fi + if [ $? -gt 0 ] + then + echo "Could not install alsa-utils" + exit $? + fi + RESULT=$(aplay -L) + if [ $? -gt 0 ] + then + echo "Could not execute aplay" + exit $? + fi + NL=$(echo "${RESULT}" | grep tegrasndt19xmob | wc -l) + if [ ${NL} -ne 2 ] + then + echo "Aplay did not find the correct device check:" + echo "${RESULT}" + exit 11 + fi + exit 0 + resources: + limits: + cpu: 100m + memory: 100Mi + smarter-devices/snd: 1 + requests: + cpu: 100m + memory: 100Mi + smarter-devices/snd: 1 + terminationGracePeriodSeconds: 10 diff --git a/client-nvidia.yaml.template b/client-nvidia.yaml.template new file mode 100644 index 0000000..7949a34 --- /dev/null +++ b/client-nvidia.yaml.template @@ -0,0 +1,36 @@ +apiVersion: v1 +kind: Pod +metadata: + name: smarter-device-management-nvidia-client + namespace: NAMESPACE +spec: + serviceAccountName: default + automountServiceAccountToken: false + dnsPolicy: ClusterFirstWithHostNet + hostname: yocto-test-client + nodeName: NODE_TO_TEST + restartPolicy: Never + containers: + - name: smarter-device-management-nvidia-client + imagePullPolicy: IfNotPresent + image: alpine + command: ["/bin/ash"] + args: + - "-c" + - | + if [ ! -e /dev/nvhost-gpu ] + then + echo "No nvidia GPU available (/dev/nvhost-gpu)" + exit 1 + fi + exit 0 + resources: + limits: + cpu: 100m + memory: 100Mi + smarter-devices/nvidia-gpu0: 0 + requests: + cpu: 100m + memory: 100Mi + smarter-devices/nvidia-gpu0: 0 + terminationGracePeriodSeconds: 10 diff --git a/compile.sh b/compile.sh index 14262a0..8f6e123 100755 --- a/compile.sh +++ b/compile.sh @@ -34,10 +34,10 @@ REPOSITORY_NAME="registry.gitlab.com/arm-research/smarter/smarter-device-manager IMAGE_NAME="smarter-device-manager" DIRECTORY_TO_RUN=. -ARCHS="linux/arm/v7" +ARCHS="linux/arm64" # Variable defaults -FLAG_UPLOADIMAGES=1 +FLAG_UPLOADIMAGES=0 FLAG_USESQUASH=0 FLAG_UPLOADMANIFEST=1 ADDITIONAL_TAG="" @@ -96,6 +96,8 @@ fi if [ $FLAG_UPLOADIMAGES -gt 0 ] then PUSH_OPTION="--push" +else + PUSH_OPTION="--load" fi docker buildx build -t "${REPOSITORY_NAME}${IMAGE_NAME}${ADDITIONAL_IMAGE_NAME}:${BUILD_TAG}" --platform=${ARCHS} ${PUSH_OPTION} . diff --git a/conf.yaml b/conf.yaml index c786588..d7408e3 100644 --- a/conf.yaml +++ b/conf.yaml @@ -23,4 +23,4 @@ - devicematch: ^ttyTHS[0-9]*$ nummaxdevices: 1 - devicematch: ^ttyS[0-9]*$ - nummaxdevices: 1 \ No newline at end of file + nummaxdevices: 1 diff --git a/main.go b/main.go index 6f23064..e03ff2f 100644 --- a/main.go +++ b/main.go @@ -5,6 +5,7 @@ package main import ( "flag" "fmt" + "strings" "os" "regexp" "syscall" @@ -18,13 +19,21 @@ import ( var confFileName string +const ( + deviceFileType uint = 0 + nvidiaSysType uint = 1 +) + type DeviceInstance struct { - devicePlugin *SmarterDevicePlugin + devicePluginSmarter *SmarterDevicePlugin + devicePluginNvidia *NvidiaDevicePlugin deviceName string socketName string deviceFile string numDevices uint + deviceType uint + deviceId string } type DesiredDevice struct { @@ -46,8 +55,8 @@ func init() { flag.Parse() } -func readDevDirectory() (files []string, err error) { - f, err := os.Open("/dev") +func readDevDirectory(dirToList string) (files []string, err error) { + f, err := os.Open(dirToList) if err != nil { return nil, err } @@ -93,34 +102,65 @@ func main() { } glog.V(0).Info("Reading existing devices on /dev") - ExistingDevices, err := readDevDirectory() + ExistingDevices, err := readDevDirectory("/dev") if err != nil { glog.Errorf(err.Error()) os.Exit(1) } + ExistingDevicesSys, err := readDevDirectory("/sys/devices") + if err != nil { + glog.Errorf(err.Error()) + os.Exit(1) + } var listDevicesAvailable []DeviceInstance for _, deviceToTest := range desiredDevices { - glog.V(0).Infof("Checking devices %s on /dev",deviceToTest.DeviceMatch) - foundDevices,err := findDevicesPattern(ExistingDevices, deviceToTest.DeviceMatch) - if err != nil { - glog.Errorf(err.Error()) - os.Exit(1) - } + if deviceToTest.DeviceMatch == "nvidia-gpu" { + glog.V(0).Infof("Checking nvidia devices") + foundDevices,err := findDevicesPattern(ExistingDevicesSys, "gpu.[0-9]*") + if err != nil { + glog.Errorf(err.Error()) + os.Exit(1) + } - // If found some create the devices entry - if len(foundDevices) > 0 { - for _, deviceToCreate := range foundDevices { - var newDevice DeviceInstance - newDevice.deviceName = "smarter-devices/" + deviceToCreate - newDevice.socketName = pluginapi.DevicePluginPath + "smarter-" + deviceToCreate + ".sock" - newDevice.deviceFile = "/dev/" + deviceToCreate - newDevice.numDevices = deviceToTest.NumMaxDevices - listDevicesAvailable = append(listDevicesAvailable, newDevice) - glog.V(0).Infof("Creating device %s socket and %s name for %s",newDevice.deviceName,newDevice.deviceFile,deviceToTest.DeviceMatch) - } - } + // If found some create the devices entry + if len(foundDevices) > 0 { + for _, deviceToCreate := range foundDevices { + var newDevice DeviceInstance + deviceId := strings.TrimPrefix(deviceToCreate,"gpu.") + newDevice.deviceName = "smarter-devices/" + "nvidia-gpu" + deviceId + newDevice.deviceId = deviceId + newDevice.socketName = pluginapi.DevicePluginPath + "smarter-nvidia-gpu" + deviceId + ".sock" + newDevice.deviceFile = deviceId + newDevice.numDevices = deviceToTest.NumMaxDevices + newDevice.deviceType = nvidiaSysType + listDevicesAvailable = append(listDevicesAvailable, newDevice) + glog.V(0).Infof("Creating device %s socket and %s name for %s",newDevice.deviceName,newDevice.deviceFile,deviceToTest.DeviceMatch) + } + } + } else { + glog.V(0).Infof("Checking devices %s on /dev",deviceToTest.DeviceMatch) + foundDevices,err := findDevicesPattern(ExistingDevices, deviceToTest.DeviceMatch) + if err != nil { + glog.Errorf(err.Error()) + os.Exit(1) + } + + // If found some create the devices entry + if len(foundDevices) > 0 { + for _, deviceToCreate := range foundDevices { + var newDevice DeviceInstance + newDevice.deviceType = deviceFileType + newDevice.deviceName = "smarter-devices/" + deviceToCreate + newDevice.socketName = pluginapi.DevicePluginPath + "smarter-" + deviceToCreate + ".sock" + newDevice.deviceFile = "/dev/" + deviceToCreate + newDevice.numDevices = deviceToTest.NumMaxDevices + listDevicesAvailable = append(listDevicesAvailable, newDevice) + glog.V(0).Infof("Creating device %s socket and %s name for %s",newDevice.deviceName,newDevice.deviceFile,deviceToTest.DeviceMatch) + } + } + } } glog.V(0).Info("Starting FS watcher.") @@ -140,18 +180,34 @@ L: for { if restart { for _, devicesInUse := range listDevicesAvailable { - if devicesInUse.devicePlugin != nil { - devicesInUse.devicePlugin.Stop() - } + switch devicesInUse.deviceType { + case deviceFileType : + if devicesInUse.devicePluginSmarter != nil { + devicesInUse.devicePluginSmarter.Stop() + } + case nvidiaSysType : + if devicesInUse.devicePluginNvidia != nil { + devicesInUse.devicePluginNvidia.Stop() + } + } } var err error for _, devicesInUse := range listDevicesAvailable { - devicesInUse.devicePlugin = NewSmarterDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceFile, devicesInUse.deviceName, devicesInUse.socketName) - if err = devicesInUse.devicePlugin.Serve(); err != nil { - glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?") - break - } + switch devicesInUse.deviceType { + case deviceFileType : + devicesInUse.devicePluginSmarter = NewSmarterDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceFile, devicesInUse.deviceName, devicesInUse.socketName) + if err = devicesInUse.devicePluginSmarter.Serve(); err != nil { + glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?") + break + } + case nvidiaSysType : + devicesInUse.devicePluginNvidia = NewNvidiaDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceName,"NVIDIA_VISIBLE_DEVICES", devicesInUse.socketName, devicesInUse.deviceId) + if err = devicesInUse.devicePluginNvidia.Serve(); err != nil { + glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?") + break + } + } } if err != nil { continue @@ -178,9 +234,16 @@ L: default: glog.V(0).Infof("Received signal \"%v\", shutting down.", s) for _, devicesInUse := range listDevicesAvailable { - if devicesInUse.devicePlugin != nil { - devicesInUse.devicePlugin.Stop() - } + switch devicesInUse.deviceType { + case deviceFileType : + if devicesInUse.devicePluginSmarter != nil { + devicesInUse.devicePluginSmarter.Stop() + } + case nvidiaSysType : + if devicesInUse.devicePluginNvidia != nil { + devicesInUse.devicePluginNvidia.Stop() + } + } } break L } diff --git a/nvidia-server.go b/nvidia-server.go new file mode 100644 index 0000000..4f0aa10 --- /dev/null +++ b/nvidia-server.go @@ -0,0 +1,276 @@ +// Copyright (c) 2019, Arm Ltd + +package main + +import ( + "flag" + "net" + "os" + "path" + "strings" + "time" + + "github.com/golang/glog" + "golang.org/x/net/context" + "google.golang.org/grpc" + pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1" +) + +var passDeviceSpecs = flag.Bool("pass-device-specs", false, "pass the list of DeviceSpecs to the kubelet on Allocate()") + +// NvidiaDevicePlugin implements the Kubernetes device plugin API +type NvidiaDevicePlugin struct { + devs []*pluginapi.Device + socket string + resourceName string + allocateEnvvar string + id string + + + stop chan interface{} + health chan *pluginapi.Device + + server *grpc.Server +} + +// NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin +func NewNvidiaDevicePlugin(nDevices uint, resourceName string, allocateEnvvar string, socket string, id string) *NvidiaDevicePlugin { + return &NvidiaDevicePlugin{ + devs: getDevices(nDevices), + resourceName: resourceName, + allocateEnvvar: allocateEnvvar, + socket: socket, + id: id, + + stop: make(chan interface{}), + health: make(chan *pluginapi.Device), + } +} + +// dial establishes the gRPC communication with the registered device plugin. +func dialNvidia(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) { + c, err := grpc.Dial(unixSocketPath, grpc.WithInsecure(), grpc.WithBlock(), + grpc.WithTimeout(timeout), + grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) { + return net.DialTimeout("unix", addr, timeout) + }), + ) + + if err != nil { + return nil, err + } + + return c, nil +} + +// Start the gRPC server of the device plugin +func (m *NvidiaDevicePlugin) Start() error { + glog.V(0).Info("Initializing nvidia device manager") + err := m.cleanup() + if err != nil { + return err + } + + glog.V(0).Info("Opening nvidia device manager socket ", m.socket) + sock, err := net.Listen("unix", m.socket) + if err != nil { + return err + } + glog.V(0).Info("Socket opened nvidia device manager") + + m.server = grpc.NewServer([]grpc.ServerOption{}...) + pluginapi.RegisterDevicePluginServer(m.server, m) + glog.V(0).Info("gRPC server registered") + + go m.server.Serve(sock) + glog.V(0).Info("gRPC server running on socket") + + // Wait for server to start by launching a blocking connexion + conn, err := dialNvidia(m.socket, 60*time.Second) + if err != nil { + return err + } + conn.Close() + glog.V(0).Info("gRPC Dial OK") + + go m.healthcheck() + + return nil +} + +// Stop the gRPC server +func (m *NvidiaDevicePlugin) Stop() error { + if m.server == nil { + return nil + } + + m.server.Stop() + m.server = nil + close(m.stop) + + return m.cleanup() +} + +// Register the device plugin for the given resourceName with Kubelet. +func (m *NvidiaDevicePlugin) Register(kubeletEndpoint, resourceName string) error { + conn, err := dialNvidia(kubeletEndpoint, 5*time.Second) + if err != nil { + return err + } + defer conn.Close() + + client := pluginapi.NewRegistrationClient(conn) + reqt := &pluginapi.RegisterRequest{ + Version: pluginapi.Version, + Endpoint: path.Base(m.socket), + ResourceName: resourceName, + } + + _, err = client.Register(context.Background(), reqt) + if err != nil { + return err + } + return nil +} + +// ListAndWatch lists devices and update that list according to the health status +func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error { + s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs}) + + for { + select { + case <-m.stop: + return nil + case d := <-m.health: + // FIXME: there is no way to recover from the Unhealthy state. + d.Health = pluginapi.Unhealthy + s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs}) + } + } +} + +func (m *NvidiaDevicePlugin) unhealthy(dev *pluginapi.Device) { + m.health <- dev +} + +// Allocate which return list of devices. +func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) { + responses := pluginapi.AllocateResponse{} + for _, req := range reqs.ContainerRequests { + //for _, id := range req.DevicesIDs { + // if !m.deviceExists(id) { + // return nil, fmt.Errorf("invalid allocation request for '%s': unknown device: %s", m.resourceName, id) + // } + // + + response := pluginapi.ContainerAllocateResponse{ + Envs: map[string]string{ + m.allocateEnvvar: m.id, + }, + } + if *passDeviceSpecs { + response.Devices = m.apiDeviceSpecs(req.DevicesIDs) + } + + responses.ContainerResponses = append(responses.ContainerResponses, &response) + } + + return &responses, nil +} + +func (m *NvidiaDevicePlugin) PreStartContainer(context.Context, *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) { + return &pluginapi.PreStartContainerResponse{}, nil +} + +func (m *NvidiaDevicePlugin) cleanup() error { + if err := os.Remove(m.socket); err != nil && !os.IsNotExist(err) { + return err + } + + return nil +} + +func (m *NvidiaDevicePlugin) healthcheck() { + disableHealthChecks := strings.ToLower(os.Getenv(envDisableHealthChecks)) + if disableHealthChecks == "all" { + disableHealthChecks = allHealthChecks + } + + _, cancel := context.WithCancel(context.Background()) + + var xids chan *pluginapi.Device + if !strings.Contains(disableHealthChecks, "xids") { + xids = make(chan *pluginapi.Device) + } + + for { + select { + case <-m.stop: + cancel() + return + case dev := <-xids: + m.unhealthy(dev) + } + } +} + +// Serve starts the gRPC server and register the device plugin to Kubelet +func (m *NvidiaDevicePlugin) Serve() error { + err := m.Start() + if err != nil { + glog.Errorf("Could not start device plugin: %s", err) + return err + } + glog.V(0).Info("Starting to serve on", m.socket) + + err = m.Register(pluginapi.KubeletSocket, m.resourceName) + if err != nil { + glog.Errorf("Could not register device plugin: %s", err) + m.Stop() + return err + } + glog.V(0).Info("Registered device plugin with Kubelet") + + return nil +} + +func (m *NvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) { + return &pluginapi.DevicePluginOptions{}, nil +} + +func (m *NvidiaDevicePlugin) apiDeviceSpecs(filter []string) []*pluginapi.DeviceSpec { + var specs []*pluginapi.DeviceSpec + + paths := []string{ + "/dev/nvidiactl", + "/dev/nvidia-uvm", + "/dev/nvidia-uvm-tools", + "/dev/nvidia-modeset", + } + + for _, p := range paths { + if _, err := os.Stat(p); err == nil { + spec := &pluginapi.DeviceSpec{ + ContainerPath: p, + HostPath: p, + Permissions: "rw", + } + specs = append(specs, spec) + } + } + +// for _, d := range m.devs { +// for _, id := range filter { +// if d.ID == id { +// spec := &pluginapi.DeviceSpec{ +// ContainerPath: d.Path, +// HostPath: d.Path, +// Permissions: "rw", +// } +// specs = append(specs, spec) +// } +// } +// } + + return specs +} diff --git a/server.go b/server.go index 7003b07..b0a75b3 100644 --- a/server.go +++ b/server.go @@ -37,7 +37,7 @@ type SmarterDevicePlugin struct { // NewSmarterDevicePlugin returns an initialized SmarterDevicePlugin func NewSmarterDevicePlugin(nDevices uint, deviceFilename string, resourceIdentification string, serverSock string) *SmarterDevicePlugin { return &SmarterDevicePlugin{ - devs: getDevices(uint(10)), + devs: getDevices(nDevices), socket: serverSock, deviceFile: deviceFilename, resourceName: resourceIdentification, diff --git a/smarter-device-management-pod-k3s-test-xavier.yaml b/smarter-device-management-pod-k3s-test-xavier.yaml new file mode 100644 index 0000000..1499e9f --- /dev/null +++ b/smarter-device-management-pod-k3s-test-xavier.yaml @@ -0,0 +1,53 @@ +apiVersion: v1 +kind: Pod +metadata: + name: smarter-device-management + namespace: default +spec: + # Mark this pod as a critical add-on; when enabled, the critical add-on + # scheduler reserves resources for critical add-on pods so that they can + # be rescheduled after a failure. + # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ + priorityClassName: "system-node-critical" + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + hostname: smarter-device-management + nodeName: smarter-jetson-xavier-4bcc2584 + containers: + - name: smarter-device-manager + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + resources: + limits: + cpu: 100m + memory: 10Mi + requests: + cpu: 10m + memory: 10Mi + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: dev-dir + mountPath: /dev + - name: sys-dir + mountPath: /sys + - name: config + mountPath: /root/config + volumes: + - name: device-plugin + hostPath: + path: /var/lib/rancher/k3s/agent/kubelet/device-plugins + - name: dev-dir + hostPath: + path: /dev + - name: sys-dir + hostPath: + path: /sys + - name: config + configMap: + name: smarter-device-manager-xavier + terminationGracePeriodSeconds: 30 diff --git a/smarter-device-management-pod-k3s.yaml b/smarter-device-management-pod-k3s.yaml index acbd566..d600afb 100644 --- a/smarter-device-management-pod-k3s.yaml +++ b/smarter-device-management-pod-k3s.yaml @@ -15,7 +15,7 @@ spec: nodeName: containers: - name: smarter-device-manager - image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID imagePullPolicy: IfNotPresent securityContext: allowPrivilegeEscalation: false @@ -33,6 +33,8 @@ spec: mountPath: /var/lib/kubelet/device-plugins - name: dev-dir mountPath: /dev + - name: sys-dir + mountPath: /sys volumes: - name: device-plugin hostPath: @@ -40,4 +42,7 @@ spec: - name: dev-dir hostPath: path: /dev - terminationGracePeriodSeconds: 30 + - name: sys-dir + hostPath: + path: /sys + terminationGracePeriodSeconds: 30 diff --git a/smarter-device-management-pod-k8s.yaml b/smarter-device-management-pod-k8s.yaml index 196f982..b499716 100644 --- a/smarter-device-management-pod-k8s.yaml +++ b/smarter-device-management-pod-k8s.yaml @@ -15,7 +15,7 @@ spec: nodeName: containers: - name: smarter-device-manager - image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID imagePullPolicy: IfNotPresent securityContext: allowPrivilegeEscalation: false diff --git a/smarter-device-manager-configmap-xavier.yaml b/smarter-device-manager-configmap-xavier.yaml new file mode 100644 index 0000000..d1c38a9 --- /dev/null +++ b/smarter-device-manager-configmap-xavier.yaml @@ -0,0 +1,26 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: smarter-device-manager-xavier +data: + conf.yaml: | + - devicematch: ^snd$ + nummaxdevices: 20 + - devicematch: ^gpiomem$ + nummaxdevices: 40 + - devicematch: ^gpiochip[0-9]*$ + nummaxdevices: 20 + - devicematch: ^hci[0-9]*$ + nummaxdevices: 1 + - devicematch: ^i2c-[0-9]*$ + nummaxdevices: 1 + - devicematch: ^rtc0$ + nummaxdevices: 20 + - devicematch: ^video[0-9]*$ + nummaxdevices: 20 + - devicematch: ^vchiq$ + nummaxdevices: 20 + - devicematch: ^vcsm.*$ + nummaxdevices: 20 + - devicematch: nvidia-gpu + nummaxdevices: 20 diff --git a/smarter-device-manager-k3s-no-configmap.yaml b/smarter-device-manager-k3s-no-configmap.yaml new file mode 100644 index 0000000..3c73069 --- /dev/null +++ b/smarter-device-manager-k3s-no-configmap.yaml @@ -0,0 +1,48 @@ +apiVersion: v1 +kind: Pod +metadata: + name: smarter-device-management + namespace: default +spec: + # Mark this pod as a critical add-on; when enabled, the critical add-on + # scheduler reserves resources for critical add-on pods so that they can + # be rescheduled after a failure. + # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ + priorityClassName: "system-node-critical" + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + hostname: smarter-device-management + nodeName: smarter-jetson-xavier-4bcc2584 + containers: + - name: smarter-device-manager + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + resources: + limits: + cpu: 100m + memory: 10Mi + requests: + cpu: 10m + memory: 10Mi + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: dev-dir + mountPath: /dev + - name: sys-dir + mountPath: /sys + volumes: + - name: device-plugin + hostPath: + path: /var/lib/rancher/k3s/agent/kubelet/device-plugins + - name: dev-dir + hostPath: + path: /dev + - name: sys-dir + hostPath: + path: /sys + terminationGracePeriodSeconds: 30 diff --git a/smarter-device-manager-k3s-with-configmap-rpi.yaml b/smarter-device-manager-k3s-with-configmap-rpi.yaml new file mode 100644 index 0000000..13da61d --- /dev/null +++ b/smarter-device-manager-k3s-with-configmap-rpi.yaml @@ -0,0 +1,75 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: < Replace with the namespace to use > + labels: + name: < Replace with the namespace to use > +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: smarter-device-manager + namespace: < Replace with the namespace to use > + labels: + name: smarter-device-manager + role: agent +spec: + selector: + matchLabels: + name: smarter-device-manager + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: smarter-device-manager + annotations: + node.kubernetes.io/bootstrap-checkpoint: "true" + spec: + nodeSelector: + smarter-device-manager : enabled + priorityClassName: "system-node-critical" + hostname: smarter-device-management + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + containers: + - name: smarter-device-manager + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + resources: + limits: + cpu: 100m + memory: 15Mi + requests: + cpu: 10m + memory: 15Mi + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: dev-dir + mountPath: /dev + - name: config + mountPath: /root/config + - name: sys-dir + mountPath: /sys + volumes: + - name: device-plugin + hostPath: + path: /var/lib/rancher/k3s/agent/kubelet/device-plugins + - name: dev-dir + hostPath: + path: /dev + - name: sys-dir + hostPath: + path: /sys + - name: config + configMap: + name: smarter-device-manager-rpi + - name: config + hostPath: + path: /var/lib/rancher/k3s/agent/kubelet/device-plugins + terminationGracePeriodSeconds: 30 diff --git a/smarter-device-manager-k3s-cs.yaml b/smarter-device-manager-k3s-with-configmap-xavier.yaml similarity index 92% rename from smarter-device-manager-k3s-cs.yaml rename to smarter-device-manager-k3s-with-configmap-xavier.yaml index e508bf4..afb1aa8 100644 --- a/smarter-device-manager-k3s-cs.yaml +++ b/smarter-device-manager-k3s-with-configmap-xavier.yaml @@ -36,7 +36,7 @@ spec: - name: k8sedgeregcred containers: - name: smarter-device-manager - image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID imagePullPolicy: IfNotPresent securityContext: allowPrivilegeEscalation: false @@ -54,6 +54,8 @@ spec: mountPath: /var/lib/kubelet/device-plugins - name: dev-dir mountPath: /dev + - name: sys-dir + mountPath: /sys - name: config mountPath: /root/config volumes: @@ -63,6 +65,9 @@ spec: - name: dev-dir hostPath: path: /dev + - name: sys-dir + hostPath: + path: /sys - name: config configMap: name: smarter-device-manager-rpi diff --git a/smarter-device-manager-k3s.yaml b/smarter-device-manager-k3s.yaml index 29e836a..70d5d6f 100644 --- a/smarter-device-manager-k3s.yaml +++ b/smarter-device-manager-k3s.yaml @@ -32,11 +32,9 @@ spec: hostname: smarter-device-management hostNetwork: true dnsPolicy: ClusterFirstWithHostNet - imagePullSecrets: - - name: k8sedgeregcred containers: - name: smarter-device-manager - image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID imagePullPolicy: IfNotPresent securityContext: allowPrivilegeEscalation: false @@ -54,6 +52,8 @@ spec: mountPath: /var/lib/kubelet/device-plugins - name: dev-dir mountPath: /dev + - name: sys-dir + mountPath: /sys volumes: - name: device-plugin hostPath: @@ -61,4 +61,7 @@ spec: - name: dev-dir hostPath: path: /dev + - name: sys-dir + hostPath: + path: /sys terminationGracePeriodSeconds: 30 diff --git a/smarter-device-manager-k8s-cs.yaml b/smarter-device-manager-k8s-with-configmap-rpi.yaml similarity index 91% rename from smarter-device-manager-k8s-cs.yaml rename to smarter-device-manager-k8s-with-configmap-rpi.yaml index f841227..37c5dd8 100644 --- a/smarter-device-manager-k8s-cs.yaml +++ b/smarter-device-manager-k8s-with-configmap-rpi.yaml @@ -36,7 +36,7 @@ spec: - name: k8sedgeregcred containers: - name: smarter-device-manager - image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID imagePullPolicy: IfNotPresent securityContext: allowPrivilegeEscalation: false @@ -54,6 +54,8 @@ spec: mountPath: /var/lib/kubelet/device-plugins - name: dev-dir mountPath: /dev + - name: sys-dir + mountPath: /sys - name: config mountPath: /root/config volumes: @@ -63,6 +65,9 @@ spec: - name: dev-dir hostPath: path: /dev + - name: sys-dir + hostPath: + path: /sys - name: config configMap: name: smarter-device-manager-rpi diff --git a/smarter-device-manager-k8s-with-configmap-xavier.yaml b/smarter-device-manager-k8s-with-configmap-xavier.yaml new file mode 100644 index 0000000..ae861b5 --- /dev/null +++ b/smarter-device-manager-k8s-with-configmap-xavier.yaml @@ -0,0 +1,74 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: < Replace with the namespace to use > + labels: + name: < Replace with the namespace to use > +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: smarter-device-manager + namespace: < Replace with the namespace to use > + labels: + name: smarter-device-manager + role: agent +spec: + selector: + matchLabels: + name: smarter-device-manager + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: smarter-device-manager + annotations: + node.kubernetes.io/bootstrap-checkpoint: "true" + spec: + nodeSelector: + smarter-device-manager : enabled + priorityClassName: "system-node-critical" + hostname: smarter-device-management + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + imagePullSecrets: + - name: k8sedgeregcred + containers: + - name: smarter-device-manager + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + resources: + limits: + cpu: 100m + memory: 15Mi + requests: + cpu: 10m + memory: 15Mi + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: dev-dir + mountPath: /dev + - name: sys-dir + mountPath: /sys + - name: config + mountPath: /root/config + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins + - name: dev-dir + hostPath: + path: /dev + - name: sys-dir + hostPath: + path: /sys + - name: config + configMap: + name: smarter-device-manager-xavier + terminationGracePeriodSeconds: 30 diff --git a/smarter-device-manager-k8s.yaml b/smarter-device-manager-k8s.yaml index f38afc7..12c67be 100644 --- a/smarter-device-manager-k8s.yaml +++ b/smarter-device-manager-k8s.yaml @@ -36,7 +36,7 @@ spec: - name: k8sedgeregcred containers: - name: smarter-device-manager - image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613 + image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID imagePullPolicy: IfNotPresent securityContext: allowPrivilegeEscalation: false @@ -54,6 +54,8 @@ spec: mountPath: /var/lib/kubelet/device-plugins - name: dev-dir mountPath: /dev + - name: sys-dir + mountPath: /sys volumes: - name: device-plugin hostPath: @@ -61,4 +63,7 @@ spec: - name: dev-dir hostPath: path: /dev + - name: sys-dir + hostPath: + path: /sys terminationGracePeriodSeconds: 30