20 Commits

Author SHA1 Message Date
Alexandre Ferreira
7eb7526956 Merge branch 'dev' into 'master'
Dev

See merge request arm-research/smarter/smarter-device-manager!13
2021-01-06 01:35:33 +00:00
Alexandre Ferreira
304807e48e Fix removal of sockets files on shhutdown 2021-01-05 18:43:56 -06:00
Alexandre Ferreira
879085aaed Fix for k3s >= 1.18 2020-06-24 15:16:01 -05:00
Alexandre Ferreira
43009d99cc Merge branch 'update-registry' into 'master'
Update registry with the latest image

See merge request arm-research/smarter/smarter-device-manager!12
2020-06-10 20:36:22 +00:00
Alexandre Ferreira
ee5804f7b8 Update registry with the latest image 2020-06-10 15:26:28 -05:00
Alexandre Ferreira
7a3353216d Merge branch 'k3s-1.18' into 'master'
Add tty to the configmaps and rename yaml files to correspond to uses and README

See merge request arm-research/smarter/smarter-device-manager!11
2020-06-10 16:06:27 +00:00
Alexandre Ferreira
0eab2f36e6 Add tty to the configmaps and rename yaml files to correspond to uses and README 2020-06-10 10:36:18 -05:00
Alexandre Ferreira
c7605e87d8 Merge branch 'add-nvidia-conf' into 'master'
Update conf.yaml to support nvidia-gpu

See merge request arm-research/smarter/smarter-device-manager!10
2020-06-09 23:29:16 +00:00
Alexandre Ferreira
21960c3eda Update conf.yaml to support nvidia-gpu 2020-06-09 18:22:16 -05:00
Alexandre Ferreira
2e68d7793d Merge branch 'add-nvidiagpu' into 'master'
Add nvidiagpu

See merge request arm-research/smarter/smarter-device-manager!9
2020-06-09 21:30:17 +00:00
Alexandre Ferreira
633128015b Add support for nvidia-gpu 2020-06-09 16:15:51 -05:00
Alexandre Ferreira
06a5d1129c Forgot the nvidia portion of the server 2020-06-08 14:53:33 -05:00
Alexandre Ferreira
727594c382 New version of the nvidia GPU access 2020-06-08 14:45:14 -05:00
Alexandre Ferreira
94783dfc37 WIP: for adding nvidia-gpu as a device 2020-06-04 14:11:13 -05:00
Alexandre Ferreira
f1b720f53e Merge branch 'feature/add-serial' into 'master'
Add common serial port incantations

See merge request arm-research/smarter/smarter-device-manager!6
2020-06-02 17:37:25 +00:00
Alexandre Ferreira
acb56cef95 Merge branch 'alex' into 'master'
Add configmap to the daemonsets

See merge request arm-research/smarter/smarter-device-manager!7
2020-06-02 17:19:18 +00:00
Alexandre Ferreira
94e5449a72 Add configmap to the daemonsets 2020-06-02 12:13:09 -05:00
Eric Van Hensbergen
8a81ecce8b Add common serial port incantations
Signed-off-by: Eric Van Hensbergen <eric.vanhensbergen@arm.com>
2020-06-02 11:27:08 -05:00
Alexandre Ferreira
7e976ae90c Merge branch 'simplify-ci' into 'master'
Simplify CI

See merge request arm-research/smarter/smarter-device-manager!5
2020-05-11 17:23:01 +00:00
Eric Van Hensbergen
377f5fb26e Simplify CI
Use the template instead of duplicating it.

Signed-off-by: Eric Van Hensbergen <eric.vanhensbergen@arm.com>
2020-05-10 00:02:40 +00:00
20 changed files with 963 additions and 105 deletions

View File

@@ -1,57 +1,6 @@
include:
- project: 'ericvh/gitlab-ci-arm-template'
file: '/.gitlab-ci.yml'
variables:
CI_BUILD_IMAGE: "registry.gitlab.com/ericvh/docker-buildx-qemu"
CI_BUILDX_ARCHS: "linux/amd64,linux/arm64,linux/arm"
.build:
image: $CI_BUILD_IMAGE
stage: build
services:
- name: docker:dind
entrypoint: ["env", "-u", "DOCKER_HOST"]
command: ["dockerd-entrypoint.sh"]
variables:
DOCKER_HOST: tcp://docker:2375/
DOCKER_DRIVER: overlay2
# See https://github.com/docker-library/docker/pull/166
DOCKER_TLS_CERTDIR: ""
retry: 2
before_script:
- |
if [[ -z "$CI_COMMIT_TAG" ]]; then
export CI_APPLICATION_REPOSITORY=${CI_APPLICATION_REPOSITORY:-$CI_REGISTRY_IMAGE/$CI_COMMIT_REF_SLUG}
export CI_APPLICATION_TAG=${CI_APPLICATION_TAG:-$CI_COMMIT_SHA}
else
export CI_APPLICATION_REPOSITORY=${CI_APPLICATION_REPOSITORY:-$CI_REGISTRY_IMAGE}
export CI_APPLICATION_TAG=${CI_APPLICATION_TAG:-$CI_COMMIT_TAG}
fi
- echo "$CI_REGISTRY_PASSWORD" | docker login -u "$CI_REGISTRY_USER" --password-stdin $CI_REGISTRY
build:buildx-master:
extends: .build
only:
refs:
- master
variables:
- $CI_BUILDX_ARCHS
script:
# Use docker-container driver to allow useful features (push/multi-platform)
- update-binfmts --enable # Important: Ensures execution of other binary formats is enabled in the kernel
- docker buildx create --driver docker-container --use
- docker buildx inspect --bootstrap
- docker buildx ls
- docker buildx build --platform $CI_BUILDX_ARCHS --progress plain --pull -t "$CI_REGISTRY_IMAGE" --push .
build:buildx:
extends: .build
only:
variables:
- $CI_BUILDX_ARCHS
except:
refs:
- master
script:
- update-binfmts --enable # Important: Ensures execution of other binary formats is enabled in the kernel
- docker buildx create --driver docker-container --use
- docker buildx inspect --bootstrap
- docker buildx ls
- docker buildx build --platform $CI_BUILDX_ARCHS --progress plain --pull -t "$CI_APPLICATION_REPOSITORY:$CI_APPLICATION_TAG" --push .

View File

@@ -147,3 +147,6 @@ The following command should show the node resources in a similar form as shown
kubectl describe node pike5
```
## k3s
K3s < 1.18 stores the plugin interface in a different directory than k8s, so it needs a different yaml file to enable smarter-device-manager to communicate correctly with the k3s agent. Use the smart-device-manager-k3s yaml files in this repository for k3s < 1.18.

64
client-alsa.yaml.template Normal file
View File

@@ -0,0 +1,64 @@
# Test pod for the smarter-devices/snd resource: it succeeds only when
# /dev/snd is visible in the container and `aplay -L` lists the expected
# sound card. NAMESPACE and NODE_TO_TEST are template placeholders to be
# substituted before the manifest is applied.
apiVersion: v1
kind: Pod
metadata:
  name: smarter-device-management-client
  namespace: NAMESPACE
spec:
  serviceAccountName: default
  automountServiceAccountToken: false
  dnsPolicy: ClusterFirstWithHostNet
  hostname: yocto-test-client
  nodeName: NODE_TO_TEST
  # Never restart: the container exit code is the test verdict.
  restartPolicy: Never
  containers:
    - name: smarter-device-management-client
      imagePullPolicy: IfNotPresent
      image: alpine
      command: ["/bin/ash"]
      args:
        - "-c"
        - |
          if [ ! -d /dev/snd ]
          then
              echo "No sound directory available (/dev/snd)"
              exit 1
          fi
          apk add alsa-utils
          # Save the status now: every later command (echo, sleep) overwrites $?.
          RC=$?
          if [ ${RC} -gt 0 ]
          then
              echo "Could not install alsa-utils"
              # Keep the container alive for a while before reporting the failure.
              for i in 1 2 3 4 5 6 7 8 9 10
              do
                  sleep 20
              done
              exit ${RC}
          fi
          RESULT=$(aplay -L)
          RC=$?
          if [ ${RC} -gt 0 ]
          then
              echo "Could not execute aplay"
              exit ${RC}
          fi
          NL=$(echo "${RESULT}" | grep tegrasndt19xmob | wc -l)
          if [ ${NL} -ne 2 ]
          then
              echo "Aplay did not find the correct device check:"
              echo "${RESULT}"
              exit 11
          fi
          exit 0
      resources:
        limits:
          cpu: 100m
          memory: 100Mi
          smarter-devices/snd: 1
        requests:
          cpu: 100m
          memory: 100Mi
          smarter-devices/snd: 1
  terminationGracePeriodSeconds: 10

View File

@@ -0,0 +1,36 @@
# Test pod for the nvidia-gpu resource: the container exits 0 only when
# /dev/nvhost-gpu exists inside it, and 1 otherwise.
# NAMESPACE and NODE_TO_TEST are placeholders to be substituted before use.
apiVersion: v1
kind: Pod
metadata:
name: smarter-device-management-nvidia-client
namespace: NAMESPACE
spec:
serviceAccountName: default
automountServiceAccountToken: false
dnsPolicy: ClusterFirstWithHostNet
hostname: yocto-test-client
nodeName: NODE_TO_TEST
# Never restart: the container exit code is the test verdict.
restartPolicy: Never
containers:
- name: smarter-device-management-nvidia-client
imagePullPolicy: IfNotPresent
image: alpine
command: ["/bin/ash"]
args:
- "-c"
- |
if [ ! -e /dev/nvhost-gpu ]
then
echo "No nvidia GPU available (/dev/nvhost-gpu)"
exit 1
fi
exit 0
# NOTE(review): the smarter-devices/nvidia-gpu0 quantity is 0 in both limits
# and requests, unlike the snd test pod which asks for 1 - confirm that a
# zero request is intended here.
resources:
limits:
cpu: 100m
memory: 100Mi
smarter-devices/nvidia-gpu0: 0
requests:
cpu: 100m
memory: 100Mi
smarter-devices/nvidia-gpu0: 0
terminationGracePeriodSeconds: 10

View File

@@ -5,6 +5,12 @@
function printHelp() {
echo $(basename $0)" options:";
echo " -A <Architectures to use> # Compiling to ${ARCHS} now, examples: linux/amd64,linux/arm/v7,linux/arm/v6,linux/arm64"
if [ ${FLAG_NOCACHE} -gt 0 ]
then
echo " -C # Do not use cache"
else
echo " -C # Use cache"
fi
if [ ${FLAG_UPLOADIMAGES} -gt 0 ]
then
echo " -U # Do not upload images - the default is upload the images to the registry"
@@ -34,17 +40,18 @@ REPOSITORY_NAME="registry.gitlab.com/arm-research/smarter/smarter-device-manager
IMAGE_NAME="smarter-device-manager"
DIRECTORY_TO_RUN=.
ARCHS="linux/arm/v7"
ARCHS="linux/arm64"
# Variable defaults
FLAG_UPLOADIMAGES=1
FLAG_UPLOADIMAGES=0
FLAG_USESQUASH=0
FLAG_UPLOADMANIFEST=1
ADDITIONAL_TAG=""
ADDITIONAL_IMAGE_NAME=""
PUSH_OPTION=""
FLAG_NOCACHE=0
while getopts hA:B:MST:U name
while getopts hA:B:MST:UC name
do
case $name in
h)
@@ -52,6 +59,10 @@ do
exit 0;;
A)
ARCHS="$OPTARG";;
C)
[ ${FLAG_NOCACHE} -gt 0 ] && FLAG_NOCACHE=0;
[ ${FLAG_NOCACHE} -eq 0 ] && FLAG_NOCACHE=1;
;;
U)
[ ${FLAG_UPLOADIMAGES} -gt 0 ] && FLAG_UPLOADIMAGES=0;
[ ${FLAG_UPLOADIMAGES} -eq 0 ] && FLAG_UPLOADIMAGES=1;
@@ -93,11 +104,20 @@ EOF
fi
fi
if [ $FLAG_NOCACHE -gt 0 ]
then
CACHE_OPTION="--no-cache"
else
CACHE_OPTION=""
fi
if [ $FLAG_UPLOADIMAGES -gt 0 ]
then
PUSH_OPTION="--push"
else
PUSH_OPTION="--load"
fi
docker buildx build -t "${REPOSITORY_NAME}${IMAGE_NAME}${ADDITIONAL_IMAGE_NAME}:${BUILD_TAG}" --platform=${ARCHS} ${PUSH_OPTION} .
docker buildx build ${CACHE_OPTION} -t "${REPOSITORY_NAME}${IMAGE_NAME}${ADDITIONAL_IMAGE_NAME}:${BUILD_TAG}" --platform=${ARCHS} ${PUSH_OPTION} .
exit 0

View File

@@ -15,4 +15,14 @@
- devicematch: ^vchiq$
nummaxdevices: 20
- devicematch: ^vcsm.*$
nummaxdevices: 20
nummaxdevices: 20
- devicematch: ^ttyUSB[0-9]*$
nummaxdevices: 1
- devicematch: ^ttyACM[0-9]*$
nummaxdevices: 1
- devicematch: ^ttyTHS[0-9]*$
nummaxdevices: 1
- devicematch: ^ttyS[0-9]*$
nummaxdevices: 1
- devicematch: nvidia-gpu
nummaxdevices: 10

136
main.go
View File

@@ -5,6 +5,7 @@ package main
import (
"flag"
"fmt"
"strings"
"os"
"regexp"
"syscall"
@@ -18,13 +19,21 @@ import (
var confFileName string
const (
deviceFileType uint = 0
nvidiaSysType uint = 1
)
type DeviceInstance struct {
devicePlugin *SmarterDevicePlugin
devicePluginSmarter *SmarterDevicePlugin
devicePluginNvidia *NvidiaDevicePlugin
deviceName string
socketName string
deviceFile string
numDevices uint
deviceType uint
deviceId string
}
type DesiredDevice struct {
@@ -46,8 +55,8 @@ func init() {
flag.Parse()
}
func readDevDirectory() (files []string, err error) {
f, err := os.Open("/dev")
func readDevDirectory(dirToList string) (files []string, err error) {
f, err := os.Open(dirToList)
if err != nil {
return nil, err
}
@@ -93,34 +102,65 @@ func main() {
}
glog.V(0).Info("Reading existing devices on /dev")
ExistingDevices, err := readDevDirectory()
ExistingDevices, err := readDevDirectory("/dev")
if err != nil {
glog.Errorf(err.Error())
os.Exit(1)
}
ExistingDevicesSys, err := readDevDirectory("/sys/devices")
if err != nil {
glog.Errorf(err.Error())
os.Exit(1)
}
var listDevicesAvailable []DeviceInstance
for _, deviceToTest := range desiredDevices {
glog.V(0).Infof("Checking devices %s on /dev",deviceToTest.DeviceMatch)
foundDevices,err := findDevicesPattern(ExistingDevices, deviceToTest.DeviceMatch)
if err != nil {
glog.Errorf(err.Error())
os.Exit(1)
}
if deviceToTest.DeviceMatch == "nvidia-gpu" {
glog.V(0).Infof("Checking nvidia devices")
foundDevices,err := findDevicesPattern(ExistingDevicesSys, "gpu.[0-9]*")
if err != nil {
glog.Errorf(err.Error())
os.Exit(1)
}
// If found some create the devices entry
if len(foundDevices) > 0 {
for _, deviceToCreate := range foundDevices {
var newDevice DeviceInstance
newDevice.deviceName = "smarter-devices/" + deviceToCreate
newDevice.socketName = pluginapi.DevicePluginPath + "smarter-" + deviceToCreate + ".sock"
newDevice.deviceFile = "/dev/" + deviceToCreate
newDevice.numDevices = deviceToTest.NumMaxDevices
listDevicesAvailable = append(listDevicesAvailable, newDevice)
glog.V(0).Infof("Creating device %s socket and %s name for %s",newDevice.deviceName,newDevice.deviceFile,deviceToTest.DeviceMatch)
}
}
// If found some create the devices entry
if len(foundDevices) > 0 {
for _, deviceToCreate := range foundDevices {
var newDevice DeviceInstance
deviceId := strings.TrimPrefix(deviceToCreate,"gpu.")
newDevice.deviceName = "smarter-devices/" + "nvidia-gpu" + deviceId
newDevice.deviceId = deviceId
newDevice.socketName = pluginapi.DevicePluginPath + "smarter-nvidia-gpu" + deviceId + ".sock"
newDevice.deviceFile = deviceId
newDevice.numDevices = deviceToTest.NumMaxDevices
newDevice.deviceType = nvidiaSysType
listDevicesAvailable = append(listDevicesAvailable, newDevice)
glog.V(0).Infof("Creating device %s socket and %s name for %s",newDevice.deviceName,newDevice.deviceFile,deviceToTest.DeviceMatch)
}
}
} else {
glog.V(0).Infof("Checking devices %s on /dev",deviceToTest.DeviceMatch)
foundDevices,err := findDevicesPattern(ExistingDevices, deviceToTest.DeviceMatch)
if err != nil {
glog.Errorf(err.Error())
os.Exit(1)
}
// If found some create the devices entry
if len(foundDevices) > 0 {
for _, deviceToCreate := range foundDevices {
var newDevice DeviceInstance
newDevice.deviceType = deviceFileType
newDevice.deviceName = "smarter-devices/" + deviceToCreate
newDevice.socketName = pluginapi.DevicePluginPath + "smarter-" + deviceToCreate + ".sock"
newDevice.deviceFile = "/dev/" + deviceToCreate
newDevice.numDevices = deviceToTest.NumMaxDevices
listDevicesAvailable = append(listDevicesAvailable, newDevice)
glog.V(0).Infof("Creating device %s socket and %s name for %s",newDevice.deviceName,newDevice.deviceFile,deviceToTest.DeviceMatch)
}
}
}
}
glog.V(0).Info("Starting FS watcher.")
@@ -140,18 +180,34 @@ L:
for {
if restart {
for _, devicesInUse := range listDevicesAvailable {
if devicesInUse.devicePlugin != nil {
devicesInUse.devicePlugin.Stop()
}
switch devicesInUse.deviceType {
case deviceFileType :
if devicesInUse.devicePluginSmarter != nil {
devicesInUse.devicePluginSmarter.Stop()
}
case nvidiaSysType :
if devicesInUse.devicePluginNvidia != nil {
devicesInUse.devicePluginNvidia.Stop()
}
}
}
var err error
for _, devicesInUse := range listDevicesAvailable {
devicesInUse.devicePlugin = NewSmarterDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceFile, devicesInUse.deviceName, devicesInUse.socketName)
if err = devicesInUse.devicePlugin.Serve(); err != nil {
glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?")
break
}
for id, _ := range listDevicesAvailable {
switch listDevicesAvailable[id].deviceType {
case deviceFileType :
listDevicesAvailable[id].devicePluginSmarter = NewSmarterDevicePlugin(listDevicesAvailable[id].numDevices, listDevicesAvailable[id].deviceFile, listDevicesAvailable[id].deviceName, listDevicesAvailable[id].socketName)
if err = listDevicesAvailable[id].devicePluginSmarter.Serve(); err != nil {
glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?")
break
}
case nvidiaSysType :
listDevicesAvailable[id].devicePluginNvidia = NewNvidiaDevicePlugin(listDevicesAvailable[id].numDevices, listDevicesAvailable[id].deviceName,"NVIDIA_VISIBLE_DEVICES", listDevicesAvailable[id].socketName, listDevicesAvailable[id].deviceId)
if err = listDevicesAvailable[id].devicePluginNvidia.Serve(); err != nil {
glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?")
break
}
}
}
if err != nil {
continue
@@ -178,9 +234,21 @@ L:
default:
glog.V(0).Infof("Received signal \"%v\", shutting down.", s)
for _, devicesInUse := range listDevicesAvailable {
if devicesInUse.devicePlugin != nil {
devicesInUse.devicePlugin.Stop()
}
glog.V(0).Info("Stopping device ", devicesInUse.deviceName)
switch devicesInUse.deviceType {
case deviceFileType :
glog.V(0).Info("Smarter device type")
if devicesInUse.devicePluginSmarter != nil {
glog.V(0).Info("Stopping device")
devicesInUse.devicePluginSmarter.Stop()
}
case nvidiaSysType :
glog.V(0).Info("Nvidia device type")
if devicesInUse.devicePluginNvidia != nil {
glog.V(0).Info("Stopping device")
devicesInUse.devicePluginNvidia.Stop()
}
}
}
break L
}

276
nvidia-server.go Normal file
View File

@@ -0,0 +1,276 @@
// Copyright (c) 2019, Arm Ltd
package main
import (
"flag"
"net"
"os"
"path"
"strings"
"time"
"github.com/golang/glog"
"golang.org/x/net/context"
"google.golang.org/grpc"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
)
// passDeviceSpecs is the "pass-device-specs" command-line flag (default false).
// When set, Allocate also fills the response with the DeviceSpecs built by
// apiDeviceSpecs.
var passDeviceSpecs = flag.Bool("pass-device-specs", false, "pass the list of DeviceSpecs to the kubelet on Allocate()")
// NvidiaDevicePlugin implements the Kubernetes device plugin API
type NvidiaDevicePlugin struct {
// devs is the device list sent to the kubelet by ListAndWatch.
devs []*pluginapi.Device
// socket is the path of the unix socket the plugin's gRPC server listens on.
socket string
// resourceName is the resource name registered with the kubelet by Serve.
resourceName string
// allocateEnvvar is the name of the environment variable set in every Allocate response.
allocateEnvvar string
// id is the value given to allocateEnvvar in every Allocate response.
id string
// stop is closed by Stop; ListAndWatch and healthcheck return when it is closed.
stop chan interface{}
// health carries devices that ListAndWatch must mark Unhealthy.
health chan *pluginapi.Device
// server is the gRPC server created by Start; Stop resets it to nil.
server *grpc.Server
}
// NewNvidiaDevicePlugin builds an NvidiaDevicePlugin that is ready to Serve.
//
// nDevices is forwarded to getDevices to produce the advertised device list,
// resourceName is the name later registered with the kubelet, allocateEnvvar
// and id are the environment variable name/value handed out by Allocate, and
// socket is the unix socket path the gRPC server will listen on. The stop and
// health channels are created unbuffered; the gRPC server is left unset until
// Start is called.
func NewNvidiaDevicePlugin(nDevices uint, resourceName string, allocateEnvvar string, socket string, id string) *NvidiaDevicePlugin {
	plugin := new(NvidiaDevicePlugin)
	plugin.devs = getDevices(nDevices)
	plugin.resourceName = resourceName
	plugin.allocateEnvvar = allocateEnvvar
	plugin.socket = socket
	plugin.id = id
	plugin.stop = make(chan interface{})
	plugin.health = make(chan *pluginapi.Device)
	return plugin
}
// dialNvidia opens an insecure, blocking gRPC client connection to the unix
// socket at unixSocketPath. The dial is bounded by timeout; on failure it
// returns a nil connection and the dial error.
func dialNvidia(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) {
	// gRPC hands the target address to this dialer, which connects over a
	// unix domain socket instead of TCP.
	unixDialer := func(addr string, d time.Duration) (net.Conn, error) {
		return net.DialTimeout("unix", addr, d)
	}

	options := []grpc.DialOption{
		grpc.WithInsecure(),
		grpc.WithBlock(),
		grpc.WithTimeout(timeout),
		grpc.WithDialer(unixDialer),
	}

	conn, err := grpc.Dial(unixSocketPath, options...)
	if err != nil {
		return nil, err
	}
	return conn, nil
}
// Start launches the gRPC server of the device plugin.
//
// It removes any stale socket file, listens on m.socket, registers the plugin
// service, serves in a background goroutine and then verifies the server is
// reachable with a blocking self-dial (60 s limit). On success the health
// check goroutine is started.
//
// It returns the first error from cleanup, listen or the self-dial. When the
// self-dial fails the server that was just started is stopped and m.server is
// reset to nil, so a failed Start leaves no serving goroutine or socket behind.
func (m *NvidiaDevicePlugin) Start() error {
	glog.V(0).Info("Initializing nvidia device manager")
	if err := m.cleanup(); err != nil {
		return err
	}

	glog.V(0).Info("Opening nvidia device manager socket ", m.socket)
	sock, err := net.Listen("unix", m.socket)
	if err != nil {
		return err
	}
	glog.V(0).Info("Socket opened nvidia device manager")

	m.server = grpc.NewServer([]grpc.ServerOption{}...)
	pluginapi.RegisterDevicePluginServer(m.server, m)
	glog.V(0).Info("gRPC server registered")

	go m.server.Serve(sock)
	glog.V(0).Info("gRPC server running on socket")

	// Wait for the server to start by launching a blocking connection.
	conn, err := dialNvidia(m.socket, 60*time.Second)
	if err != nil {
		// The server never became reachable: tear it down instead of
		// leaving the Serve goroutine and the socket file around.
		m.server.Stop()
		m.server = nil
		if cleanupErr := m.cleanup(); cleanupErr != nil {
			glog.Errorf("Could not remove socket %s: %s", m.socket, cleanupErr)
		}
		return err
	}
	conn.Close()
	glog.V(0).Info("gRPC Dial OK")

	go m.healthcheck()
	return nil
}
// Stop shuts the plugin down. When a server is running it is stopped, the
// server field is cleared, the stop channel is closed (ending ListAndWatch and
// healthcheck) and the socket file is removed; the removal error, if any, is
// returned. Calling Stop when no server is set is a no-op that returns nil.
func (m *NvidiaDevicePlugin) Stop() error {
	if srv := m.server; srv != nil {
		srv.Stop()
		m.server = nil
		close(m.stop)
		return m.cleanup()
	}
	return nil
}
// Register announces this plugin to the kubelet listening on kubeletEndpoint.
//
// It dials the kubelet (5 s limit) and sends a RegisterRequest carrying the
// plugin API version, the base name of the plugin socket and resourceName.
// The dial error or the registration error is returned; nil means success.
func (m *NvidiaDevicePlugin) Register(kubeletEndpoint, resourceName string) error {
	conn, err := dialNvidia(kubeletEndpoint, 5*time.Second)
	if err != nil {
		return err
	}
	defer conn.Close()

	request := pluginapi.RegisterRequest{
		Version:      pluginapi.Version,
		Endpoint:     path.Base(m.socket),
		ResourceName: resourceName,
	}
	registration := pluginapi.NewRegistrationClient(conn)
	_, err = registration.Register(context.Background(), &request)
	return err
}
// ListAndWatch sends the current device list to the kubelet and then re-sends
// it every time a device is reported on the health channel, after marking that
// device Unhealthy.
//
// It returns nil once the stop channel is closed. If sending on the stream
// fails, the stream is no longer usable, so the send error is returned instead
// of being ignored.
func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
	if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs}); err != nil {
		return err
	}

	for {
		select {
		case <-m.stop:
			return nil
		case d := <-m.health:
			// FIXME: there is no way to recover from the Unhealthy state.
			d.Health = pluginapi.Unhealthy
			if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs}); err != nil {
				return err
			}
		}
	}
}
// unhealthy hands dev to ListAndWatch through the health channel so that it is
// marked Unhealthy. The channel is created unbuffered by NewNvidiaDevicePlugin,
// so this call blocks until ListAndWatch receives the device.
func (m *NvidiaDevicePlugin) unhealthy(dev *pluginapi.Device) {
m.health <- dev
}
// Allocate answers a kubelet allocation request with one response per
// container request. Every response sets the environment variable named by
// m.allocateEnvvar to m.id; when the pass-device-specs flag is set, the device
// specs from apiDeviceSpecs are attached as well.
//
// The requested device IDs are not validated against the advertised devices.
// The returned error is always nil.
func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
	result := &pluginapi.AllocateResponse{}

	for _, containerReq := range reqs.ContainerRequests {
		containerResp := &pluginapi.ContainerAllocateResponse{
			Envs: map[string]string{m.allocateEnvvar: m.id},
		}
		if *passDeviceSpecs {
			containerResp.Devices = m.apiDeviceSpecs(containerReq.DevicesIDs)
		}
		result.ContainerResponses = append(result.ContainerResponses, containerResp)
	}

	return result, nil
}
// PreStartContainer performs no work: it ignores its arguments and always
// returns an empty response with a nil error.
func (m *NvidiaDevicePlugin) PreStartContainer(context.Context, *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) {
return &pluginapi.PreStartContainerResponse{}, nil
}
// cleanup deletes the plugin's socket file. A file that does not exist is not
// treated as an error; any other removal failure is returned to the caller.
func (m *NvidiaDevicePlugin) cleanup() error {
	err := os.Remove(m.socket)
	if err == nil || os.IsNotExist(err) {
		return nil
	}
	return err
}
// healthcheck runs until the stop channel is closed. It reads the environment
// variable named by envDisableHealthChecks and, unless the lower-cased value
// contains "xids", allocates the xids channel. Nothing in this function sends
// on that channel, so as written the loop only waits for stop.
func (m *NvidiaDevicePlugin) healthcheck() {
disableHealthChecks := strings.ToLower(os.Getenv(envDisableHealthChecks))
// "all" is replaced by allHealthChecks (defined outside this block;
// presumably the list of every check name - TODO confirm).
if disableHealthChecks == "all" {
disableHealthChecks = allHealthChecks
}
// The derived context itself is discarded; only cancel is kept and it is
// called when stop is closed.
_, cancel := context.WithCancel(context.Background())
// xids stays nil when the check is disabled; receiving from a nil channel
// blocks forever, so that select case is then never taken.
var xids chan *pluginapi.Device
if !strings.Contains(disableHealthChecks, "xids") {
xids = make(chan *pluginapi.Device)
}
for {
select {
case <-m.stop:
cancel()
return
case dev := <-xids:
m.unhealthy(dev)
}
}
}
// Serve starts the gRPC server and registers the device plugin with the
// kubelet under m.resourceName.
//
// It returns the error from Start or from Register. When registration fails
// the server that was just started is stopped again before returning.
func (m *NvidiaDevicePlugin) Serve() error {
	if err := m.Start(); err != nil {
		glog.Errorf("Could not start device plugin: %s", err)
		return err
	}
	// glog.Info joins its arguments like fmt.Print, which inserts no space
	// between two string operands, so the separator lives in the literal.
	glog.V(0).Info("Starting to serve on ", m.socket)

	if err := m.Register(pluginapi.KubeletSocket, m.resourceName); err != nil {
		glog.Errorf("Could not register device plugin: %s", err)
		m.Stop()
		return err
	}
	glog.V(0).Info("Registered device plugin with Kubelet")

	return nil
}
// GetDevicePluginOptions reports the plugin's options to the kubelet. It
// always returns an empty DevicePluginOptions value and a nil error.
func (m *NvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
return &pluginapi.DevicePluginOptions{}, nil
}
// apiDeviceSpecs returns a DeviceSpec for each of the fixed NVIDIA control
// device nodes that currently exists on the host. Each spec maps the host path
// to the same path inside the container with "rw" permissions. Nodes that
// cannot be stat'ed are skipped, so the result is nil when none are present.
//
// filter is accepted but not used: no per-device specs are added for the
// requested device IDs.
func (m *NvidiaDevicePlugin) apiDeviceSpecs(filter []string) []*pluginapi.DeviceSpec {
	controlNodes := []string{
		"/dev/nvidiactl",
		"/dev/nvidia-uvm",
		"/dev/nvidia-uvm-tools",
		"/dev/nvidia-modeset",
	}

	var found []*pluginapi.DeviceSpec
	for _, node := range controlNodes {
		if _, statErr := os.Stat(node); statErr != nil {
			continue
		}
		found = append(found, &pluginapi.DeviceSpec{
			ContainerPath: node,
			HostPath:      node,
			Permissions:   "rw",
		})
	}
	return found
}

View File

@@ -37,7 +37,7 @@ type SmarterDevicePlugin struct {
// NewSmarterDevicePlugin returns an initialized SmarterDevicePlugin
func NewSmarterDevicePlugin(nDevices uint, deviceFilename string, resourceIdentification string, serverSock string) *SmarterDevicePlugin {
return &SmarterDevicePlugin{
devs: getDevices(uint(10)),
devs: getDevices(nDevices),
socket: serverSock,
deviceFile: deviceFilename,
resourceName: resourceIdentification,
@@ -94,6 +94,7 @@ func (m *SmarterDevicePlugin) Start() error {
// Stop the gRPC server
func (m *SmarterDevicePlugin) Stop() error {
glog.V(0).Infof("Stopping server with socket ",m.socket)
if m.server == nil {
return nil
}
@@ -101,6 +102,7 @@ func (m *SmarterDevicePlugin) Stop() error {
m.server.Stop()
m.server = nil
close(m.stop)
glog.V(0).Info("Server stopped with socket ",m.socket)
return m.cleanup()
}
@@ -179,6 +181,7 @@ func (m *SmarterDevicePlugin) PreStartContainer(context.Context, *pluginapi.PreS
}
func (m *SmarterDevicePlugin) cleanup() error {
glog.V(0).Info("Removing file ",m.socket)
if err := os.Remove(m.socket); err != nil && !os.IsNotExist(err) {
return err
}

View File

@@ -0,0 +1,53 @@
# Stand-alone Pod that runs smarter-device-manager on a single node, reading
# its device list from the smarter-device-manager-xavier ConfigMap.
apiVersion: v1
kind: Pod
metadata:
name: smarter-device-management
namespace: default
spec:
# Mark this pod as a critical add-on; when enabled, the critical add-on
# scheduler reserves resources for critical add-on pods so that they can
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: "system-node-critical"
hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet
hostname: smarter-device-management
# Hard-coded target node: change this to the node the pod should run on.
nodeName: smarter-jetson-xavier-4bcc2584
containers:
- name: smarter-device-manager
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager:v1.1.2
imagePullPolicy: IfNotPresent
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
resources:
limits:
cpu: 100m
memory: 10Mi
requests:
cpu: 10m
memory: 10Mi
volumeMounts:
# Directory in which the plugin sockets are created for the kubelet.
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: dev-dir
mountPath: /dev
- name: sys-dir
mountPath: /sys
# conf.yaml from the ConfigMap below appears under this directory.
- name: config
mountPath: /root/config
volumes:
# k3s location of the kubelet device-plugins directory on the host.
- name: device-plugin
hostPath:
path: /var/lib/rancher/k3s/agent/kubelet/device-plugins
- name: dev-dir
hostPath:
path: /dev
- name: sys-dir
hostPath:
path: /sys
- name: config
configMap:
name: smarter-device-manager-xavier
terminationGracePeriodSeconds: 30

View File

@@ -15,7 +15,7 @@ spec:
nodeName: <replace with node to run>
containers:
- name: smarter-device-manager
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager:v1.1.2
imagePullPolicy: IfNotPresent
securityContext:
allowPrivilegeEscalation: false
@@ -33,6 +33,8 @@ spec:
mountPath: /var/lib/kubelet/device-plugins
- name: dev-dir
mountPath: /dev
- name: sys-dir
mountPath: /sys
volumes:
- name: device-plugin
hostPath:
@@ -40,4 +42,7 @@ spec:
- name: dev-dir
hostPath:
path: /dev
terminationGracePeriodSeconds: 30
- name: sys-dir
hostPath:
path: /sys
terminationGracePeriodSeconds: 30

View File

@@ -15,7 +15,7 @@ spec:
nodeName: <replace with node to run>
containers:
- name: smarter-device-manager
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager:v1.1.2
imagePullPolicy: IfNotPresent
securityContext:
allowPrivilegeEscalation: false
@@ -33,6 +33,8 @@ spec:
mountPath: /var/lib/kubelet/device-plugins
- name: dev-dir
mountPath: /dev
- name: sys-dir
mountPath: /sys
volumes:
- name: device-plugin
hostPath:
@@ -40,4 +42,7 @@ spec:
- name: dev-dir
hostPath:
path: /dev
terminationGracePeriodSeconds: 30
- name: sys-dir
hostPath:
path: /sys
terminationGracePeriodSeconds: 30

View File

@@ -0,0 +1,32 @@
# ConfigMap holding the conf.yaml device list for smarter-device-manager.
# Each conf.yaml entry pairs a devicematch regular expression, matched against
# the entry names found in /dev, with a nummaxdevices count for that device.
apiVersion: v1
kind: ConfigMap
metadata:
name: smarter-device-manager-rpi
data:
conf.yaml: |
- devicematch: ^snd$
nummaxdevices: 20
- devicematch: ^gpiomem$
nummaxdevices: 40
- devicematch: ^gpiochip[0-9]*$
nummaxdevices: 20
- devicematch: ^hci[0-9]*$
nummaxdevices: 1
- devicematch: ^i2c-[0-9]*$
nummaxdevices: 1
- devicematch: ^rtc0$
nummaxdevices: 20
- devicematch: ^video[0-9]*$
nummaxdevices: 20
- devicematch: ^vchiq$
nummaxdevices: 20
- devicematch: ^vcsm.*$
nummaxdevices: 20
- devicematch: ^ttyUSB[0-9]*$
nummaxdevices: 1
- devicematch: ^ttyACM[0-9]*$
nummaxdevices: 1
- devicematch: ^ttyTHS[0-9]*$
nummaxdevices: 1
- devicematch: ^ttyS[0-9]*$
nummaxdevices: 1

View File

@@ -0,0 +1,34 @@
# ConfigMap holding the conf.yaml device list for smarter-device-manager.
# Each conf.yaml entry pairs a devicematch regular expression, matched against
# the entry names found in /dev, with a nummaxdevices count for that device.
# The literal value nvidia-gpu is not a /dev pattern: the manager handles it
# separately by looking for gpu.N entries under /sys/devices.
apiVersion: v1
kind: ConfigMap
metadata:
name: smarter-device-manager-xavier
data:
conf.yaml: |
- devicematch: ^snd$
nummaxdevices: 20
- devicematch: ^gpiomem$
nummaxdevices: 40
- devicematch: ^gpiochip[0-9]*$
nummaxdevices: 20
- devicematch: ^hci[0-9]*$
nummaxdevices: 1
- devicematch: ^i2c-[0-9]*$
nummaxdevices: 1
- devicematch: ^rtc0$
nummaxdevices: 20
- devicematch: ^video[0-9]*$
nummaxdevices: 20
- devicematch: ^vchiq$
nummaxdevices: 20
- devicematch: ^vcsm.*$
nummaxdevices: 20
- devicematch: ^ttyUSB[0-9]*$
nummaxdevices: 1
- devicematch: ^ttyACM[0-9]*$
nummaxdevices: 1
- devicematch: ^ttyTHS[0-9]*$
nummaxdevices: 1
- devicematch: ^ttyS[0-9]*$
nummaxdevices: 1
- devicematch: nvidia-gpu
nummaxdevices: 20

View File

@@ -32,11 +32,9 @@ spec:
hostname: smarter-device-management
hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet
imagePullSecrets:
- name: k8sedgeregcred
containers:
- name: smarter-device-manager
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager:v1.1.2
imagePullPolicy: IfNotPresent
securityContext:
allowPrivilegeEscalation: false
@@ -54,6 +52,8 @@ spec:
mountPath: /var/lib/kubelet/device-plugins
- name: dev-dir
mountPath: /dev
- name: sys-dir
mountPath: /sys
volumes:
- name: device-plugin
hostPath:
@@ -61,4 +61,7 @@ spec:
- name: dev-dir
hostPath:
path: /dev
- name: sys-dir
hostPath:
path: /sys
terminationGracePeriodSeconds: 30

View File

@@ -0,0 +1,75 @@
# Namespace plus DaemonSet that runs smarter-device-manager on every node
# labelled smarter-device-manager=enabled, using the k3s kubelet
# device-plugins directory on the host.
# Replace the "< Replace with the namespace to use >" placeholders before
# applying this manifest.
apiVersion: v1
kind: Namespace
metadata:
  name: < Replace with the namespace to use >
  labels:
    name: < Replace with the namespace to use >
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: smarter-device-manager
  namespace: < Replace with the namespace to use >
  labels:
    name: smarter-device-manager
    role: agent
spec:
  selector:
    matchLabels:
      name: smarter-device-manager
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: smarter-device-manager
      annotations:
        node.kubernetes.io/bootstrap-checkpoint: "true"
    spec:
      nodeSelector:
        smarter-device-manager: enabled
      priorityClassName: "system-node-critical"
      hostname: smarter-device-management
      hostNetwork: true
      dnsPolicy: ClusterFirstWithHostNet
      containers:
        - name: smarter-device-manager
          image: registry.gitlab.com/arm-research/smarter/smarter-device-manager:v1.1.2
          imagePullPolicy: IfNotPresent
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          resources:
            limits:
              cpu: 100m
              memory: 15Mi
            requests:
              cpu: 10m
              memory: 15Mi
          volumeMounts:
            # Directory in which the plugin sockets are created for the kubelet.
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
            - name: dev-dir
              mountPath: /dev
            # conf.yaml from the ConfigMap below appears under this directory.
            - name: config
              mountPath: /root/config
            - name: sys-dir
              mountPath: /sys
      volumes:
        # k3s location of the kubelet device-plugins directory on the host.
        - name: device-plugin
          hostPath:
            path: /var/lib/rancher/k3s/agent/kubelet/device-plugins
        - name: dev-dir
          hostPath:
            path: /dev
        - name: sys-dir
          hostPath:
            path: /sys
        # Volume names must be unique within a pod, so "config" is declared
        # once, backed by the ConfigMap; a second hostPath volume with the
        # same name makes the API server reject the pod template.
        - name: config
          configMap:
            name: smarter-device-manager-rpi
      terminationGracePeriodSeconds: 30

View File

@@ -0,0 +1,72 @@
# Namespace plus DaemonSet that runs smarter-device-manager on every node
# labelled smarter-device-manager=enabled, with its device list taken from the
# smarter-device-manager-rpi ConfigMap.
# Replace the "< Replace with the namespace to use >" placeholders before
# applying this manifest.
apiVersion: v1
kind: Namespace
metadata:
name: < Replace with the namespace to use >
labels:
name: < Replace with the namespace to use >
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: smarter-device-manager
namespace: < Replace with the namespace to use >
labels:
name: smarter-device-manager
role: agent
spec:
selector:
matchLabels:
name: smarter-device-manager
updateStrategy:
type: RollingUpdate
template:
metadata:
labels:
name: smarter-device-manager
annotations:
node.kubernetes.io/bootstrap-checkpoint: "true"
spec:
# Only nodes carrying this label run the manager.
nodeSelector:
smarter-device-manager : enabled
priorityClassName: "system-node-critical"
hostname: smarter-device-management
hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet
containers:
- name: smarter-device-manager
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager:v1.1.2
imagePullPolicy: IfNotPresent
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
resources:
limits:
cpu: 100m
memory: 15Mi
requests:
cpu: 10m
memory: 15Mi
volumeMounts:
# Directory in which the plugin sockets are created for the kubelet.
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: dev-dir
mountPath: /dev
- name: sys-dir
mountPath: /sys
# conf.yaml from the ConfigMap below appears under this directory.
- name: config
mountPath: /root/config
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
- name: dev-dir
hostPath:
path: /dev
- name: sys-dir
hostPath:
path: /sys
- name: config
configMap:
name: smarter-device-manager-rpi
terminationGracePeriodSeconds: 30

View File

@@ -0,0 +1,75 @@
# Namespace plus DaemonSet that runs smarter-device-manager on every node
# labelled smarter-device-manager=enabled, using the k3s kubelet
# device-plugins directory on the host.
# Replace the "< Replace with the namespace to use >" placeholders before
# applying this manifest.
apiVersion: v1
kind: Namespace
metadata:
  name: < Replace with the namespace to use >
  labels:
    name: < Replace with the namespace to use >
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: smarter-device-manager
  namespace: < Replace with the namespace to use >
  labels:
    name: smarter-device-manager
    role: agent
spec:
  selector:
    matchLabels:
      name: smarter-device-manager
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: smarter-device-manager
      annotations:
        node.kubernetes.io/bootstrap-checkpoint: "true"
    spec:
      nodeSelector:
        smarter-device-manager: enabled
      priorityClassName: "system-node-critical"
      hostname: smarter-device-management
      hostNetwork: true
      dnsPolicy: ClusterFirstWithHostNet
      containers:
        - name: smarter-device-manager
          image: registry.gitlab.com/arm-research/smarter/smarter-device-manager:v1.1.2
          imagePullPolicy: IfNotPresent
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          resources:
            limits:
              cpu: 100m
              memory: 15Mi
            requests:
              cpu: 10m
              memory: 15Mi
          volumeMounts:
            # Directory in which the plugin sockets are created for the kubelet.
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
            - name: dev-dir
              mountPath: /dev
            - name: sys-dir
              mountPath: /sys
            # conf.yaml from the ConfigMap below appears under this directory.
            - name: config
              mountPath: /root/config
      volumes:
        # k3s location of the kubelet device-plugins directory on the host.
        - name: device-plugin
          hostPath:
            path: /var/lib/rancher/k3s/agent/kubelet/device-plugins
        - name: dev-dir
          hostPath:
            path: /dev
        - name: sys-dir
          hostPath:
            path: /sys
        # Volume names must be unique within a pod, so "config" is declared
        # once, backed by the ConfigMap; a second hostPath volume with the
        # same name makes the API server reject the pod template.
        - name: config
          configMap:
            name: smarter-device-manager-rpi
      terminationGracePeriodSeconds: 30

View File

@@ -0,0 +1,72 @@
# Namespace plus DaemonSet that runs smarter-device-manager on every node
# labelled smarter-device-manager=enabled, with its device list taken from the
# smarter-device-manager-xavier ConfigMap.
# Replace the "< Replace with the namespace to use >" placeholders before
# applying this manifest.
apiVersion: v1
kind: Namespace
metadata:
name: < Replace with the namespace to use >
labels:
name: < Replace with the namespace to use >
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: smarter-device-manager
namespace: < Replace with the namespace to use >
labels:
name: smarter-device-manager
role: agent
spec:
selector:
matchLabels:
name: smarter-device-manager
updateStrategy:
type: RollingUpdate
template:
metadata:
labels:
name: smarter-device-manager
annotations:
node.kubernetes.io/bootstrap-checkpoint: "true"
spec:
# Only nodes carrying this label run the manager.
nodeSelector:
smarter-device-manager : enabled
priorityClassName: "system-node-critical"
hostname: smarter-device-management
hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet
containers:
- name: smarter-device-manager
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager:v1.1.2
imagePullPolicy: IfNotPresent
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
resources:
limits:
cpu: 100m
memory: 15Mi
requests:
cpu: 10m
memory: 15Mi
volumeMounts:
# Directory in which the plugin sockets are created for the kubelet.
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: dev-dir
mountPath: /dev
- name: sys-dir
mountPath: /sys
# conf.yaml from the ConfigMap below appears under this directory.
- name: config
mountPath: /root/config
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
- name: dev-dir
hostPath:
path: /dev
- name: sys-dir
hostPath:
path: /sys
- name: config
configMap:
name: smarter-device-manager-xavier
terminationGracePeriodSeconds: 30

View File

@@ -32,11 +32,9 @@ spec:
hostname: smarter-device-management
hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet
imagePullSecrets:
- name: k8sedgeregcred
containers:
- name: smarter-device-manager
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager:v1.1.2
imagePullPolicy: IfNotPresent
securityContext:
allowPrivilegeEscalation: false
@@ -54,6 +52,8 @@ spec:
mountPath: /var/lib/kubelet/device-plugins
- name: dev-dir
mountPath: /dev
- name: sys-dir
mountPath: /sys
volumes:
- name: device-plugin
hostPath:
@@ -61,4 +61,7 @@ spec:
- name: dev-dir
hostPath:
path: /dev
- name: sys-dir
hostPath:
path: /sys
terminationGracePeriodSeconds: 30