mirror of
https://gitlab.com/arm-research/smarter/smarter-device-manager.git
synced 2024-11-21 18:23:34 +00:00
Add support for nvidia-gpu
This commit is contained in:
parent
06a5d1129c
commit
633128015b
64
client-alsa.yaml.template
Normal file
64
client-alsa.yaml.template
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: Pod
|
||||||
|
metadata:
|
||||||
|
name: smarter-device-management-client
|
||||||
|
namespace: NAMESPACE
|
||||||
|
spec:
|
||||||
|
serviceAccountName: default
|
||||||
|
automountServiceAccountToken: false
|
||||||
|
dnsPolicy: ClusterFirstWithHostNet
|
||||||
|
hostname: yocto-test-client
|
||||||
|
nodeName: NODE_TO_TEST
|
||||||
|
restartPolicy: Never
|
||||||
|
containers:
|
||||||
|
- name: smarter-device-management-client
|
||||||
|
imagePullPolicy: IfNotPresent
|
||||||
|
image: alpine
|
||||||
|
command: ["/bin/ash"]
|
||||||
|
args:
|
||||||
|
- "-c"
|
||||||
|
- |
|
||||||
|
if [ ! -d /dev/snd ]
|
||||||
|
then
|
||||||
|
echo "No sound directory available (/dev/snd)"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
apk add alsa-utils
|
||||||
|
if [ $? -gt 0 ]
|
||||||
|
then
|
||||||
|
echo "Could not install alsa-utils"
|
||||||
|
for i in 1 2 3 4 5 6 7 8 9 10
|
||||||
|
do
|
||||||
|
sleep 20
|
||||||
|
done
|
||||||
|
exit $?
|
||||||
|
fi
|
||||||
|
if [ $? -gt 0 ]
|
||||||
|
then
|
||||||
|
echo "Could not install alsa-utils"
|
||||||
|
exit $?
|
||||||
|
fi
|
||||||
|
RESULT=$(aplay -L)
|
||||||
|
if [ $? -gt 0 ]
|
||||||
|
then
|
||||||
|
echo "Could not execute aplay"
|
||||||
|
exit $?
|
||||||
|
fi
|
||||||
|
NL=$(echo "${RESULT}" | grep tegrasndt19xmob | wc -l)
|
||||||
|
if [ ${NL} -ne 2 ]
|
||||||
|
then
|
||||||
|
echo "Aplay did not find the correct device check:"
|
||||||
|
echo "${RESULT}"
|
||||||
|
exit 11
|
||||||
|
fi
|
||||||
|
exit 0
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpu: 100m
|
||||||
|
memory: 100Mi
|
||||||
|
smarter-devices/snd: 1
|
||||||
|
requests:
|
||||||
|
cpu: 100m
|
||||||
|
memory: 100Mi
|
||||||
|
smarter-devices/snd: 1
|
||||||
|
terminationGracePeriodSeconds: 10
|
36
client-nvidia.yaml.template
Normal file
36
client-nvidia.yaml.template
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: Pod
|
||||||
|
metadata:
|
||||||
|
name: smarter-device-management-nvidia-client
|
||||||
|
namespace: NAMESPACE
|
||||||
|
spec:
|
||||||
|
serviceAccountName: default
|
||||||
|
automountServiceAccountToken: false
|
||||||
|
dnsPolicy: ClusterFirstWithHostNet
|
||||||
|
hostname: yocto-test-client
|
||||||
|
nodeName: NODE_TO_TEST
|
||||||
|
restartPolicy: Never
|
||||||
|
containers:
|
||||||
|
- name: smarter-device-management-nvidia-client
|
||||||
|
imagePullPolicy: IfNotPresent
|
||||||
|
image: alpine
|
||||||
|
command: ["/bin/ash"]
|
||||||
|
args:
|
||||||
|
- "-c"
|
||||||
|
- |
|
||||||
|
if [ ! -e /dev/nvhost-gpu ]
|
||||||
|
then
|
||||||
|
echo "No nvidia GPU available (/dev/nvhost-gpu)"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
exit 0
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpu: 100m
|
||||||
|
memory: 100Mi
|
||||||
|
smarter-devices/nvidia-gpu0: 0
|
||||||
|
requests:
|
||||||
|
cpu: 100m
|
||||||
|
memory: 100Mi
|
||||||
|
smarter-devices/nvidia-gpu0: 0
|
||||||
|
terminationGracePeriodSeconds: 10
|
@ -34,10 +34,10 @@ REPOSITORY_NAME="registry.gitlab.com/arm-research/smarter/smarter-device-manager
|
|||||||
IMAGE_NAME="smarter-device-manager"
|
IMAGE_NAME="smarter-device-manager"
|
||||||
DIRECTORY_TO_RUN=.
|
DIRECTORY_TO_RUN=.
|
||||||
|
|
||||||
ARCHS="linux/arm/v7"
|
ARCHS="linux/arm64"
|
||||||
|
|
||||||
# Variable defaults
|
# Variable defaults
|
||||||
FLAG_UPLOADIMAGES=1
|
FLAG_UPLOADIMAGES=0
|
||||||
FLAG_USESQUASH=0
|
FLAG_USESQUASH=0
|
||||||
FLAG_UPLOADMANIFEST=1
|
FLAG_UPLOADMANIFEST=1
|
||||||
ADDITIONAL_TAG=""
|
ADDITIONAL_TAG=""
|
||||||
@ -96,6 +96,8 @@ fi
|
|||||||
if [ $FLAG_UPLOADIMAGES -gt 0 ]
|
if [ $FLAG_UPLOADIMAGES -gt 0 ]
|
||||||
then
|
then
|
||||||
PUSH_OPTION="--push"
|
PUSH_OPTION="--push"
|
||||||
|
else
|
||||||
|
PUSH_OPTION="--load"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
docker buildx build -t "${REPOSITORY_NAME}${IMAGE_NAME}${ADDITIONAL_IMAGE_NAME}:${BUILD_TAG}" --platform=${ARCHS} ${PUSH_OPTION} .
|
docker buildx build -t "${REPOSITORY_NAME}${IMAGE_NAME}${ADDITIONAL_IMAGE_NAME}:${BUILD_TAG}" --platform=${ARCHS} ${PUSH_OPTION} .
|
||||||
|
2
main.go
2
main.go
@ -202,7 +202,7 @@ L:
|
|||||||
break
|
break
|
||||||
}
|
}
|
||||||
case nvidiaSysType :
|
case nvidiaSysType :
|
||||||
devicesInUse.devicePluginNvidia = NewNvidiaDevicePlugin(devicesInUse.deviceName,"NVIDIA_VISIBLE_DEVICES", devicesInUse.socketName, devicesInUse.deviceId)
|
devicesInUse.devicePluginNvidia = NewNvidiaDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceName,"NVIDIA_VISIBLE_DEVICES", devicesInUse.socketName, devicesInUse.deviceId)
|
||||||
if err = devicesInUse.devicePluginNvidia.Serve(); err != nil {
|
if err = devicesInUse.devicePluginNvidia.Serve(); err != nil {
|
||||||
glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?")
|
glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?")
|
||||||
break
|
break
|
||||||
|
236
nvidia-server.go
236
nvidia-server.go
@ -1,29 +1,16 @@
|
|||||||
/*
|
// Copyright (c) 2019, Arm Ltd
|
||||||
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"flag"
|
"flag"
|
||||||
"log"
|
|
||||||
"net"
|
"net"
|
||||||
"os"
|
"os"
|
||||||
"path"
|
"path"
|
||||||
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/golang/glog"
|
||||||
"golang.org/x/net/context"
|
"golang.org/x/net/context"
|
||||||
"google.golang.org/grpc"
|
"google.golang.org/grpc"
|
||||||
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
|
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
|
||||||
@ -34,124 +21,99 @@ var passDeviceSpecs = flag.Bool("pass-device-specs", false, "pass the list of De
|
|||||||
// NvidiaDevicePlugin implements the Kubernetes device plugin API
|
// NvidiaDevicePlugin implements the Kubernetes device plugin API
|
||||||
type NvidiaDevicePlugin struct {
|
type NvidiaDevicePlugin struct {
|
||||||
devs []*pluginapi.Device
|
devs []*pluginapi.Device
|
||||||
|
socket string
|
||||||
resourceName string
|
resourceName string
|
||||||
allocateEnvvar string
|
allocateEnvvar string
|
||||||
socket string
|
|
||||||
id string
|
id string
|
||||||
|
|
||||||
server *grpc.Server
|
|
||||||
stop chan interface{}
|
stop chan interface{}
|
||||||
health chan *pluginapi.Device
|
health chan *pluginapi.Device
|
||||||
|
|
||||||
|
server *grpc.Server
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin
|
// NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin
|
||||||
func NewNvidiaDevicePlugin(resourceName string, allocateEnvvar string, socket string, id string) *NvidiaDevicePlugin {
|
func NewNvidiaDevicePlugin(nDevices uint, resourceName string, allocateEnvvar string, socket string, id string) *NvidiaDevicePlugin {
|
||||||
return &NvidiaDevicePlugin{
|
return &NvidiaDevicePlugin{
|
||||||
|
devs: getDevices(nDevices),
|
||||||
resourceName: resourceName,
|
resourceName: resourceName,
|
||||||
allocateEnvvar: allocateEnvvar,
|
allocateEnvvar: allocateEnvvar,
|
||||||
socket: socket,
|
socket: socket,
|
||||||
id: id,
|
id: id,
|
||||||
|
|
||||||
|
stop: make(chan interface{}),
|
||||||
|
health: make(chan *pluginapi.Device),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *NvidiaDevicePlugin) initialize() {
|
// dial establishes the gRPC communication with the registered device plugin.
|
||||||
m.server = grpc.NewServer([]grpc.ServerOption{}...)
|
func dialNvidia(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) {
|
||||||
|
c, err := grpc.Dial(unixSocketPath, grpc.WithInsecure(), grpc.WithBlock(),
|
||||||
|
grpc.WithTimeout(timeout),
|
||||||
|
grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
|
||||||
|
return net.DialTimeout("unix", addr, timeout)
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return c, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *NvidiaDevicePlugin) cleanup() {
|
// Start the gRPC server of the device plugin
|
||||||
}
|
|
||||||
|
|
||||||
// Start starts the gRPC server, registers the device plugin with the Kubelet,
|
|
||||||
// and starts the device healthchecks.
|
|
||||||
func (m *NvidiaDevicePlugin) Start() error {
|
func (m *NvidiaDevicePlugin) Start() error {
|
||||||
m.initialize()
|
glog.V(0).Info("Initializing nvidia device manager")
|
||||||
|
err := m.cleanup()
|
||||||
err := m.Serve()
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Could not start device plugin for '%s': %s", m.resourceName, err)
|
|
||||||
m.cleanup()
|
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
log.Printf("Starting to serve '%s' on %s", m.resourceName, m.socket)
|
|
||||||
|
|
||||||
err = m.Register()
|
glog.V(0).Info("Opening nvidia device manager socket ", m.socket)
|
||||||
if err != nil {
|
|
||||||
log.Printf("Could not register device plugin: %s", err)
|
|
||||||
m.Stop()
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
log.Printf("Registered device plugin for '%s' with Kubelet", m.resourceName)
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Stop stops the gRPC server.
|
|
||||||
func (m *NvidiaDevicePlugin) Stop() error {
|
|
||||||
if m == nil || m.server == nil {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
log.Printf("Stopping to serve '%s' on %s", m.resourceName, m.socket)
|
|
||||||
m.server.Stop()
|
|
||||||
if err := os.Remove(m.socket); err != nil && !os.IsNotExist(err) {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
m.cleanup()
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Serve starts the gRPC server of the device plugin.
|
|
||||||
func (m *NvidiaDevicePlugin) Serve() error {
|
|
||||||
sock, err := net.Listen("unix", m.socket)
|
sock, err := net.Listen("unix", m.socket)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
glog.V(0).Info("Socket opened nvidia device manager")
|
||||||
|
|
||||||
|
m.server = grpc.NewServer([]grpc.ServerOption{}...)
|
||||||
pluginapi.RegisterDevicePluginServer(m.server, m)
|
pluginapi.RegisterDevicePluginServer(m.server, m)
|
||||||
|
glog.V(0).Info("gRPC server registered")
|
||||||
|
|
||||||
go func() {
|
go m.server.Serve(sock)
|
||||||
lastCrashTime := time.Now()
|
glog.V(0).Info("gRPC server running on socket")
|
||||||
restartCount := 0
|
|
||||||
for {
|
|
||||||
log.Printf("Starting GRPC server for '%s'", m.resourceName)
|
|
||||||
err := m.server.Serve(sock)
|
|
||||||
if err == nil {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
|
|
||||||
log.Printf("GRPC server for '%s' crashed with error: %v", m.resourceName, err)
|
|
||||||
|
|
||||||
// restart if it has not been too often
|
|
||||||
// i.e. if server has crashed more than 5 times and it didn't last more than one hour each time
|
|
||||||
if restartCount > 5 {
|
|
||||||
// quit
|
|
||||||
log.Fatal("GRPC server for '%s' has repeatedly crashed recently. Quitting", m.resourceName)
|
|
||||||
}
|
|
||||||
timeSinceLastCrash := time.Since(lastCrashTime).Seconds()
|
|
||||||
lastCrashTime = time.Now()
|
|
||||||
if timeSinceLastCrash > 3600 {
|
|
||||||
// it has been one hour since the last crash.. reset the count
|
|
||||||
// to reflect on the frequency
|
|
||||||
restartCount = 1
|
|
||||||
} else {
|
|
||||||
restartCount += 1
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
// Wait for server to start by launching a blocking connexion
|
// Wait for server to start by launching a blocking connexion
|
||||||
conn, err := m.dial(m.socket, 5*time.Second)
|
conn, err := dialNvidia(m.socket, 60*time.Second)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
conn.Close()
|
conn.Close()
|
||||||
|
glog.V(0).Info("gRPC Dial OK")
|
||||||
|
|
||||||
|
go m.healthcheck()
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Register registers the device plugin for the given resourceName with Kubelet.
|
// Stop the gRPC server
|
||||||
func (m *NvidiaDevicePlugin) Register() error {
|
func (m *NvidiaDevicePlugin) Stop() error {
|
||||||
conn, err := m.dial(pluginapi.KubeletSocket, 5*time.Second)
|
if m.server == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
m.server.Stop()
|
||||||
|
m.server = nil
|
||||||
|
close(m.stop)
|
||||||
|
|
||||||
|
return m.cleanup()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Register the device plugin for the given resourceName with Kubelet.
|
||||||
|
func (m *NvidiaDevicePlugin) Register(kubeletEndpoint, resourceName string) error {
|
||||||
|
conn, err := dialNvidia(kubeletEndpoint, 5*time.Second)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@ -161,7 +123,7 @@ func (m *NvidiaDevicePlugin) Register() error {
|
|||||||
reqt := &pluginapi.RegisterRequest{
|
reqt := &pluginapi.RegisterRequest{
|
||||||
Version: pluginapi.Version,
|
Version: pluginapi.Version,
|
||||||
Endpoint: path.Base(m.socket),
|
Endpoint: path.Base(m.socket),
|
||||||
ResourceName: m.resourceName,
|
ResourceName: resourceName,
|
||||||
}
|
}
|
||||||
|
|
||||||
_, err = client.Register(context.Background(), reqt)
|
_, err = client.Register(context.Background(), reqt)
|
||||||
@ -171,10 +133,6 @@ func (m *NvidiaDevicePlugin) Register() error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *NvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
|
|
||||||
return &pluginapi.DevicePluginOptions{}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// ListAndWatch lists devices and update that list according to the health status
|
// ListAndWatch lists devices and update that list according to the health status
|
||||||
func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
|
func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
|
||||||
s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs})
|
s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs})
|
||||||
@ -186,12 +144,15 @@ func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.Device
|
|||||||
case d := <-m.health:
|
case d := <-m.health:
|
||||||
// FIXME: there is no way to recover from the Unhealthy state.
|
// FIXME: there is no way to recover from the Unhealthy state.
|
||||||
d.Health = pluginapi.Unhealthy
|
d.Health = pluginapi.Unhealthy
|
||||||
log.Printf("'%s' device marked unhealthy: %s", m.resourceName, d.ID)
|
|
||||||
s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs})
|
s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (m *NvidiaDevicePlugin) unhealthy(dev *pluginapi.Device) {
|
||||||
|
m.health <- dev
|
||||||
|
}
|
||||||
|
|
||||||
// Allocate which return list of devices.
|
// Allocate which return list of devices.
|
||||||
func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
|
func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
|
||||||
responses := pluginapi.AllocateResponse{}
|
responses := pluginapi.AllocateResponse{}
|
||||||
@ -221,38 +182,61 @@ func (m *NvidiaDevicePlugin) PreStartContainer(context.Context, *pluginapi.PreSt
|
|||||||
return &pluginapi.PreStartContainerResponse{}, nil
|
return &pluginapi.PreStartContainerResponse{}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// dial establishes the gRPC communication with the registered device plugin.
|
func (m *NvidiaDevicePlugin) cleanup() error {
|
||||||
func (m *NvidiaDevicePlugin) dial(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) {
|
if err := os.Remove(m.socket); err != nil && !os.IsNotExist(err) {
|
||||||
c, err := grpc.Dial(unixSocketPath, grpc.WithInsecure(), grpc.WithBlock(),
|
return err
|
||||||
grpc.WithTimeout(timeout),
|
|
||||||
grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
|
|
||||||
return net.DialTimeout("unix", addr, timeout)
|
|
||||||
}),
|
|
||||||
)
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return c, nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
//func (m *NvidiaDevicePlugin) deviceExists(id string) bool {
|
func (m *NvidiaDevicePlugin) healthcheck() {
|
||||||
// for _, d := range m.cachedDevices {
|
disableHealthChecks := strings.ToLower(os.Getenv(envDisableHealthChecks))
|
||||||
// if d.ID == id {
|
if disableHealthChecks == "all" {
|
||||||
// return true
|
disableHealthChecks = allHealthChecks
|
||||||
// }
|
}
|
||||||
// }
|
|
||||||
// return false
|
|
||||||
//}
|
|
||||||
|
|
||||||
//func (m *NvidiaDevicePlugin) apiDevices() []*pluginapi.Device {
|
_, cancel := context.WithCancel(context.Background())
|
||||||
// var pdevs []*pluginapi.Device
|
|
||||||
// for _, d := range m.cachedDevices {
|
var xids chan *pluginapi.Device
|
||||||
// pdevs = append(pdevs, &d.Device)
|
if !strings.Contains(disableHealthChecks, "xids") {
|
||||||
// }
|
xids = make(chan *pluginapi.Device)
|
||||||
// return pdevs
|
}
|
||||||
//}
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-m.stop:
|
||||||
|
cancel()
|
||||||
|
return
|
||||||
|
case dev := <-xids:
|
||||||
|
m.unhealthy(dev)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Serve starts the gRPC server and register the device plugin to Kubelet
|
||||||
|
func (m *NvidiaDevicePlugin) Serve() error {
|
||||||
|
err := m.Start()
|
||||||
|
if err != nil {
|
||||||
|
glog.Errorf("Could not start device plugin: %s", err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
glog.V(0).Info("Starting to serve on", m.socket)
|
||||||
|
|
||||||
|
err = m.Register(pluginapi.KubeletSocket, m.resourceName)
|
||||||
|
if err != nil {
|
||||||
|
glog.Errorf("Could not register device plugin: %s", err)
|
||||||
|
m.Stop()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
glog.V(0).Info("Registered device plugin with Kubelet")
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *NvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
|
||||||
|
return &pluginapi.DevicePluginOptions{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (m *NvidiaDevicePlugin) apiDeviceSpecs(filter []string) []*pluginapi.DeviceSpec {
|
func (m *NvidiaDevicePlugin) apiDeviceSpecs(filter []string) []*pluginapi.DeviceSpec {
|
||||||
var specs []*pluginapi.DeviceSpec
|
var specs []*pluginapi.DeviceSpec
|
||||||
|
53
smarter-device-management-pod-k3s-test-xavier.yaml
Normal file
53
smarter-device-management-pod-k3s-test-xavier.yaml
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: Pod
|
||||||
|
metadata:
|
||||||
|
name: smarter-device-management
|
||||||
|
namespace: default
|
||||||
|
spec:
|
||||||
|
# Mark this pod as a critical add-on; when enabled, the critical add-on
|
||||||
|
# scheduler reserves resources for critical add-on pods so that they can
|
||||||
|
# be rescheduled after a failure.
|
||||||
|
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
|
||||||
|
priorityClassName: "system-node-critical"
|
||||||
|
hostNetwork: true
|
||||||
|
dnsPolicy: ClusterFirstWithHostNet
|
||||||
|
hostname: smarter-device-management
|
||||||
|
nodeName: smarter-jetson-xavier-4bcc2584
|
||||||
|
containers:
|
||||||
|
- name: smarter-device-manager
|
||||||
|
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
|
||||||
|
imagePullPolicy: IfNotPresent
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpu: 100m
|
||||||
|
memory: 10Mi
|
||||||
|
requests:
|
||||||
|
cpu: 10m
|
||||||
|
memory: 10Mi
|
||||||
|
volumeMounts:
|
||||||
|
- name: device-plugin
|
||||||
|
mountPath: /var/lib/kubelet/device-plugins
|
||||||
|
- name: dev-dir
|
||||||
|
mountPath: /dev
|
||||||
|
- name: sys-dir
|
||||||
|
mountPath: /sys
|
||||||
|
- name: config
|
||||||
|
mountPath: /root/config
|
||||||
|
volumes:
|
||||||
|
- name: device-plugin
|
||||||
|
hostPath:
|
||||||
|
path: /var/lib/rancher/k3s/agent/kubelet/device-plugins
|
||||||
|
- name: dev-dir
|
||||||
|
hostPath:
|
||||||
|
path: /dev
|
||||||
|
- name: sys-dir
|
||||||
|
hostPath:
|
||||||
|
path: /sys
|
||||||
|
- name: config
|
||||||
|
configMap:
|
||||||
|
name: smarter-device-manager-xavier
|
||||||
|
terminationGracePeriodSeconds: 30
|
@ -15,7 +15,7 @@ spec:
|
|||||||
nodeName: <replace with node to run>
|
nodeName: <replace with node to run>
|
||||||
containers:
|
containers:
|
||||||
- name: smarter-device-manager
|
- name: smarter-device-manager
|
||||||
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613
|
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
|
||||||
imagePullPolicy: IfNotPresent
|
imagePullPolicy: IfNotPresent
|
||||||
securityContext:
|
securityContext:
|
||||||
allowPrivilegeEscalation: false
|
allowPrivilegeEscalation: false
|
||||||
@ -45,4 +45,4 @@ spec:
|
|||||||
- name: sys-dir
|
- name: sys-dir
|
||||||
hostPath:
|
hostPath:
|
||||||
path: /sys
|
path: /sys
|
||||||
terminationGracePeriodSeconds: 30
|
terminationGracePeriodSeconds: 30
|
||||||
|
@ -15,7 +15,7 @@ spec:
|
|||||||
nodeName: <replace with node to run>
|
nodeName: <replace with node to run>
|
||||||
containers:
|
containers:
|
||||||
- name: smarter-device-manager
|
- name: smarter-device-manager
|
||||||
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613
|
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
|
||||||
imagePullPolicy: IfNotPresent
|
imagePullPolicy: IfNotPresent
|
||||||
securityContext:
|
securityContext:
|
||||||
allowPrivilegeEscalation: false
|
allowPrivilegeEscalation: false
|
||||||
|
26
smarter-device-manager-configmap-xavier.yaml
Normal file
26
smarter-device-manager-configmap-xavier.yaml
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
name: smarter-device-manager-xavier
|
||||||
|
data:
|
||||||
|
conf.yaml: |
|
||||||
|
- devicematch: ^snd$
|
||||||
|
nummaxdevices: 20
|
||||||
|
- devicematch: ^gpiomem$
|
||||||
|
nummaxdevices: 40
|
||||||
|
- devicematch: ^gpiochip[0-9]*$
|
||||||
|
nummaxdevices: 20
|
||||||
|
- devicematch: ^hci[0-9]*$
|
||||||
|
nummaxdevices: 1
|
||||||
|
- devicematch: ^i2c-[0-9]*$
|
||||||
|
nummaxdevices: 1
|
||||||
|
- devicematch: ^rtc0$
|
||||||
|
nummaxdevices: 20
|
||||||
|
- devicematch: ^video[0-9]*$
|
||||||
|
nummaxdevices: 20
|
||||||
|
- devicematch: ^vchiq$
|
||||||
|
nummaxdevices: 20
|
||||||
|
- devicematch: ^vcsm.*$
|
||||||
|
nummaxdevices: 20
|
||||||
|
- devicematch: nvidia-gpu
|
||||||
|
nummaxdevices: 20
|
48
smarter-device-manager-k3s-no-configmap.yaml
Normal file
48
smarter-device-manager-k3s-no-configmap.yaml
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: Pod
|
||||||
|
metadata:
|
||||||
|
name: smarter-device-management
|
||||||
|
namespace: default
|
||||||
|
spec:
|
||||||
|
# Mark this pod as a critical add-on; when enabled, the critical add-on
|
||||||
|
# scheduler reserves resources for critical add-on pods so that they can
|
||||||
|
# be rescheduled after a failure.
|
||||||
|
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
|
||||||
|
priorityClassName: "system-node-critical"
|
||||||
|
hostNetwork: true
|
||||||
|
dnsPolicy: ClusterFirstWithHostNet
|
||||||
|
hostname: smarter-device-management
|
||||||
|
nodeName: smarter-jetson-xavier-4bcc2584
|
||||||
|
containers:
|
||||||
|
- name: smarter-device-manager
|
||||||
|
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
|
||||||
|
imagePullPolicy: IfNotPresent
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpu: 100m
|
||||||
|
memory: 10Mi
|
||||||
|
requests:
|
||||||
|
cpu: 10m
|
||||||
|
memory: 10Mi
|
||||||
|
volumeMounts:
|
||||||
|
- name: device-plugin
|
||||||
|
mountPath: /var/lib/kubelet/device-plugins
|
||||||
|
- name: dev-dir
|
||||||
|
mountPath: /dev
|
||||||
|
- name: sys-dir
|
||||||
|
mountPath: /sys
|
||||||
|
volumes:
|
||||||
|
- name: device-plugin
|
||||||
|
hostPath:
|
||||||
|
path: /var/lib/rancher/k3s/agent/kubelet/device-plugins
|
||||||
|
- name: dev-dir
|
||||||
|
hostPath:
|
||||||
|
path: /dev
|
||||||
|
- name: sys-dir
|
||||||
|
hostPath:
|
||||||
|
path: /sys
|
||||||
|
terminationGracePeriodSeconds: 30
|
75
smarter-device-manager-k3s-with-configmap-rpi.yaml
Normal file
75
smarter-device-manager-k3s-with-configmap-rpi.yaml
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: Namespace
|
||||||
|
metadata:
|
||||||
|
name: < Replace with the namespace to use >
|
||||||
|
labels:
|
||||||
|
name: < Replace with the namespace to use >
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: DaemonSet
|
||||||
|
metadata:
|
||||||
|
name: smarter-device-manager
|
||||||
|
namespace: < Replace with the namespace to use >
|
||||||
|
labels:
|
||||||
|
name: smarter-device-manager
|
||||||
|
role: agent
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
name: smarter-device-manager
|
||||||
|
updateStrategy:
|
||||||
|
type: RollingUpdate
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
name: smarter-device-manager
|
||||||
|
annotations:
|
||||||
|
node.kubernetes.io/bootstrap-checkpoint: "true"
|
||||||
|
spec:
|
||||||
|
nodeSelector:
|
||||||
|
smarter-device-manager : enabled
|
||||||
|
priorityClassName: "system-node-critical"
|
||||||
|
hostname: smarter-device-management
|
||||||
|
hostNetwork: true
|
||||||
|
dnsPolicy: ClusterFirstWithHostNet
|
||||||
|
containers:
|
||||||
|
- name: smarter-device-manager
|
||||||
|
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
|
||||||
|
imagePullPolicy: IfNotPresent
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpu: 100m
|
||||||
|
memory: 15Mi
|
||||||
|
requests:
|
||||||
|
cpu: 10m
|
||||||
|
memory: 15Mi
|
||||||
|
volumeMounts:
|
||||||
|
- name: device-plugin
|
||||||
|
mountPath: /var/lib/kubelet/device-plugins
|
||||||
|
- name: dev-dir
|
||||||
|
mountPath: /dev
|
||||||
|
- name: config
|
||||||
|
mountPath: /root/config
|
||||||
|
- name: sys-dir
|
||||||
|
mountPath: /sys
|
||||||
|
volumes:
|
||||||
|
- name: device-plugin
|
||||||
|
hostPath:
|
||||||
|
path: /var/lib/rancher/k3s/agent/kubelet/device-plugins
|
||||||
|
- name: dev-dir
|
||||||
|
hostPath:
|
||||||
|
path: /dev
|
||||||
|
- name: sys-dir
|
||||||
|
hostPath:
|
||||||
|
path: /sys
|
||||||
|
- name: config
|
||||||
|
configMap:
|
||||||
|
name: smarter-device-manager-rpi
|
||||||
|
- name: config
|
||||||
|
hostPath:
|
||||||
|
path: /var/lib/rancher/k3s/agent/kubelet/device-plugins
|
||||||
|
terminationGracePeriodSeconds: 30
|
@ -36,7 +36,7 @@ spec:
|
|||||||
- name: k8sedgeregcred
|
- name: k8sedgeregcred
|
||||||
containers:
|
containers:
|
||||||
- name: smarter-device-manager
|
- name: smarter-device-manager
|
||||||
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613
|
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
|
||||||
imagePullPolicy: IfNotPresent
|
imagePullPolicy: IfNotPresent
|
||||||
securityContext:
|
securityContext:
|
||||||
allowPrivilegeEscalation: false
|
allowPrivilegeEscalation: false
|
||||||
@ -54,6 +54,8 @@ spec:
|
|||||||
mountPath: /var/lib/kubelet/device-plugins
|
mountPath: /var/lib/kubelet/device-plugins
|
||||||
- name: dev-dir
|
- name: dev-dir
|
||||||
mountPath: /dev
|
mountPath: /dev
|
||||||
|
- name: sys-dir
|
||||||
|
mountPath: /sys
|
||||||
- name: config
|
- name: config
|
||||||
mountPath: /root/config
|
mountPath: /root/config
|
||||||
volumes:
|
volumes:
|
||||||
@ -63,6 +65,9 @@ spec:
|
|||||||
- name: dev-dir
|
- name: dev-dir
|
||||||
hostPath:
|
hostPath:
|
||||||
path: /dev
|
path: /dev
|
||||||
|
- name: sys-dir
|
||||||
|
hostPath:
|
||||||
|
path: /sys
|
||||||
- name: config
|
- name: config
|
||||||
configMap:
|
configMap:
|
||||||
name: smarter-device-manager-rpi
|
name: smarter-device-manager-rpi
|
@ -32,11 +32,9 @@ spec:
|
|||||||
hostname: smarter-device-management
|
hostname: smarter-device-management
|
||||||
hostNetwork: true
|
hostNetwork: true
|
||||||
dnsPolicy: ClusterFirstWithHostNet
|
dnsPolicy: ClusterFirstWithHostNet
|
||||||
imagePullSecrets:
|
|
||||||
- name: k8sedgeregcred
|
|
||||||
containers:
|
containers:
|
||||||
- name: smarter-device-manager
|
- name: smarter-device-manager
|
||||||
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613
|
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
|
||||||
imagePullPolicy: IfNotPresent
|
imagePullPolicy: IfNotPresent
|
||||||
securityContext:
|
securityContext:
|
||||||
allowPrivilegeEscalation: false
|
allowPrivilegeEscalation: false
|
||||||
@ -54,6 +52,8 @@ spec:
|
|||||||
mountPath: /var/lib/kubelet/device-plugins
|
mountPath: /var/lib/kubelet/device-plugins
|
||||||
- name: dev-dir
|
- name: dev-dir
|
||||||
mountPath: /dev
|
mountPath: /dev
|
||||||
|
- name: sys-dir
|
||||||
|
mountPath: /sys
|
||||||
volumes:
|
volumes:
|
||||||
- name: device-plugin
|
- name: device-plugin
|
||||||
hostPath:
|
hostPath:
|
||||||
@ -61,4 +61,7 @@ spec:
|
|||||||
- name: dev-dir
|
- name: dev-dir
|
||||||
hostPath:
|
hostPath:
|
||||||
path: /dev
|
path: /dev
|
||||||
|
- name: sys-dir
|
||||||
|
hostPath:
|
||||||
|
path: /sys
|
||||||
terminationGracePeriodSeconds: 30
|
terminationGracePeriodSeconds: 30
|
||||||
|
@ -36,7 +36,7 @@ spec:
|
|||||||
- name: k8sedgeregcred
|
- name: k8sedgeregcred
|
||||||
containers:
|
containers:
|
||||||
- name: smarter-device-manager
|
- name: smarter-device-manager
|
||||||
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613
|
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
|
||||||
imagePullPolicy: IfNotPresent
|
imagePullPolicy: IfNotPresent
|
||||||
securityContext:
|
securityContext:
|
||||||
allowPrivilegeEscalation: false
|
allowPrivilegeEscalation: false
|
||||||
@ -54,6 +54,8 @@ spec:
|
|||||||
mountPath: /var/lib/kubelet/device-plugins
|
mountPath: /var/lib/kubelet/device-plugins
|
||||||
- name: dev-dir
|
- name: dev-dir
|
||||||
mountPath: /dev
|
mountPath: /dev
|
||||||
|
- name: sys-dir
|
||||||
|
mountPath: /sys
|
||||||
- name: config
|
- name: config
|
||||||
mountPath: /root/config
|
mountPath: /root/config
|
||||||
volumes:
|
volumes:
|
||||||
@ -63,6 +65,9 @@ spec:
|
|||||||
- name: dev-dir
|
- name: dev-dir
|
||||||
hostPath:
|
hostPath:
|
||||||
path: /dev
|
path: /dev
|
||||||
|
- name: sys-dir
|
||||||
|
hostPath:
|
||||||
|
path: /sys
|
||||||
- name: config
|
- name: config
|
||||||
configMap:
|
configMap:
|
||||||
name: smarter-device-manager-rpi
|
name: smarter-device-manager-rpi
|
74
smarter-device-manager-k8s-with-configmap-xavier.yaml
Normal file
74
smarter-device-manager-k8s-with-configmap-xavier.yaml
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: Namespace
|
||||||
|
metadata:
|
||||||
|
name: < Replace with the namespace to use >
|
||||||
|
labels:
|
||||||
|
name: < Replace with the namespace to use >
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: DaemonSet
|
||||||
|
metadata:
|
||||||
|
name: smarter-device-manager
|
||||||
|
namespace: < Replace with the namespace to use >
|
||||||
|
labels:
|
||||||
|
name: smarter-device-manager
|
||||||
|
role: agent
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
name: smarter-device-manager
|
||||||
|
updateStrategy:
|
||||||
|
type: RollingUpdate
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
name: smarter-device-manager
|
||||||
|
annotations:
|
||||||
|
node.kubernetes.io/bootstrap-checkpoint: "true"
|
||||||
|
spec:
|
||||||
|
nodeSelector:
|
||||||
|
smarter-device-manager : enabled
|
||||||
|
priorityClassName: "system-node-critical"
|
||||||
|
hostname: smarter-device-management
|
||||||
|
hostNetwork: true
|
||||||
|
dnsPolicy: ClusterFirstWithHostNet
|
||||||
|
imagePullSecrets:
|
||||||
|
- name: k8sedgeregcred
|
||||||
|
containers:
|
||||||
|
- name: smarter-device-manager
|
||||||
|
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
|
||||||
|
imagePullPolicy: IfNotPresent
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpu: 100m
|
||||||
|
memory: 15Mi
|
||||||
|
requests:
|
||||||
|
cpu: 10m
|
||||||
|
memory: 15Mi
|
||||||
|
volumeMounts:
|
||||||
|
- name: device-plugin
|
||||||
|
mountPath: /var/lib/kubelet/device-plugins
|
||||||
|
- name: dev-dir
|
||||||
|
mountPath: /dev
|
||||||
|
- name: sys-dir
|
||||||
|
mountPath: /sys
|
||||||
|
- name: config
|
||||||
|
mountPath: /root/config
|
||||||
|
volumes:
|
||||||
|
- name: device-plugin
|
||||||
|
hostPath:
|
||||||
|
path: /var/lib/kubelet/device-plugins
|
||||||
|
- name: dev-dir
|
||||||
|
hostPath:
|
||||||
|
path: /dev
|
||||||
|
- name: sys-dir
|
||||||
|
hostPath:
|
||||||
|
path: /sys
|
||||||
|
- name: config
|
||||||
|
configMap:
|
||||||
|
name: smarter-device-manager-xavier
|
||||||
|
terminationGracePeriodSeconds: 30
|
@ -36,7 +36,7 @@ spec:
|
|||||||
- name: k8sedgeregcred
|
- name: k8sedgeregcred
|
||||||
containers:
|
containers:
|
||||||
- name: smarter-device-manager
|
- name: smarter-device-manager
|
||||||
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613
|
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
|
||||||
imagePullPolicy: IfNotPresent
|
imagePullPolicy: IfNotPresent
|
||||||
securityContext:
|
securityContext:
|
||||||
allowPrivilegeEscalation: false
|
allowPrivilegeEscalation: false
|
||||||
@ -54,6 +54,8 @@ spec:
|
|||||||
mountPath: /var/lib/kubelet/device-plugins
|
mountPath: /var/lib/kubelet/device-plugins
|
||||||
- name: dev-dir
|
- name: dev-dir
|
||||||
mountPath: /dev
|
mountPath: /dev
|
||||||
|
- name: sys-dir
|
||||||
|
mountPath: /sys
|
||||||
volumes:
|
volumes:
|
||||||
- name: device-plugin
|
- name: device-plugin
|
||||||
hostPath:
|
hostPath:
|
||||||
@ -61,4 +63,7 @@ spec:
|
|||||||
- name: dev-dir
|
- name: dev-dir
|
||||||
hostPath:
|
hostPath:
|
||||||
path: /dev
|
path: /dev
|
||||||
|
- name: sys-dir
|
||||||
|
hostPath:
|
||||||
|
path: /sys
|
||||||
terminationGracePeriodSeconds: 30
|
terminationGracePeriodSeconds: 30
|
||||||
|
Loading…
Reference in New Issue
Block a user