mirror of
https://gitlab.com/arm-research/smarter/smarter-device-manager.git
synced 2024-12-25 17:26:15 +00:00
Merge branch 'add-nvidiagpu' into 'master'
Add nvidiagpu See merge request arm-research/smarter/smarter-device-manager!9
This commit is contained in:
commit
2e68d7793d
64
client-alsa.yaml.template
Normal file
64
client-alsa.yaml.template
Normal file
@ -0,0 +1,64 @@
|
||||
# Test client pod: verifies that the smarter-devices/snd resource exposes the
# ALSA device directory (/dev/snd) inside a container and that aplay can see
# the expected sound card. NAMESPACE and NODE_TO_TEST are template
# placeholders substituted before the manifest is applied.
apiVersion: v1
kind: Pod
metadata:
  name: smarter-device-management-client
  namespace: NAMESPACE
spec:
  serviceAccountName: default
  automountServiceAccountToken: false
  dnsPolicy: ClusterFirstWithHostNet
  hostname: yocto-test-client
  nodeName: NODE_TO_TEST
  restartPolicy: Never
  containers:
  - name: smarter-device-management-client
    imagePullPolicy: IfNotPresent
    image: alpine
    command: ["/bin/ash"]
    args:
    - "-c"
    - |
      if [ ! -d /dev/snd ]
      then
        echo "No sound directory available (/dev/snd)"
        exit 1
      fi
      apk add alsa-utils
      # Capture the status immediately: any later command (echo, sleep)
      # overwrites $? with its own, successful, status.
      RC=$?
      if [ ${RC} -gt 0 ]
      then
        echo "Could not install alsa-utils"
        # Presumably keeps the pod around for a while so the failure can be
        # inspected before it terminates -- TODO confirm.
        for i in 1 2 3 4 5 6 7 8 9 10
        do
          sleep 20
        done
        exit ${RC}
      fi
      RESULT=$(aplay -L)
      RC=$?
      if [ ${RC} -gt 0 ]
      then
        echo "Could not execute aplay"
        exit ${RC}
      fi
      # The test expects exactly two aplay entries for the tegrasndt19xmob card.
      NL=$(echo "${RESULT}" | grep tegrasndt19xmob | wc -l)
      if [ ${NL} -ne 2 ]
      then
        echo "Aplay did not find the correct device check:"
        echo "${RESULT}"
        exit 11
      fi
      exit 0
    resources:
      limits:
        cpu: 100m
        memory: 100Mi
        smarter-devices/snd: 1
      requests:
        cpu: 100m
        memory: 100Mi
        smarter-devices/snd: 1
  terminationGracePeriodSeconds: 10
|
36
client-nvidia.yaml.template
Normal file
36
client-nvidia.yaml.template
Normal file
@ -0,0 +1,36 @@
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: smarter-device-management-nvidia-client
|
||||
namespace: NAMESPACE
|
||||
spec:
|
||||
serviceAccountName: default
|
||||
automountServiceAccountToken: false
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostname: yocto-test-client
|
||||
nodeName: NODE_TO_TEST
|
||||
restartPolicy: Never
|
||||
containers:
|
||||
- name: smarter-device-management-nvidia-client
|
||||
imagePullPolicy: IfNotPresent
|
||||
image: alpine
|
||||
command: ["/bin/ash"]
|
||||
args:
|
||||
- "-c"
|
||||
- |
|
||||
if [ ! -e /dev/nvhost-gpu ]
|
||||
then
|
||||
echo "No nvidia GPU available (/dev/nvhost-gpu)"
|
||||
exit 1
|
||||
fi
|
||||
exit 0
|
||||
resources:
|
||||
limits:
|
||||
cpu: 100m
|
||||
memory: 100Mi
|
||||
smarter-devices/nvidia-gpu0: 0
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 100Mi
|
||||
smarter-devices/nvidia-gpu0: 0
|
||||
terminationGracePeriodSeconds: 10
|
@ -34,10 +34,10 @@ REPOSITORY_NAME="registry.gitlab.com/arm-research/smarter/smarter-device-manager
|
||||
IMAGE_NAME="smarter-device-manager"
|
||||
DIRECTORY_TO_RUN=.
|
||||
|
||||
ARCHS="linux/arm/v7"
|
||||
ARCHS="linux/arm64"
|
||||
|
||||
# Variable defaults
|
||||
FLAG_UPLOADIMAGES=1
|
||||
FLAG_UPLOADIMAGES=0
|
||||
FLAG_USESQUASH=0
|
||||
FLAG_UPLOADMANIFEST=1
|
||||
ADDITIONAL_TAG=""
|
||||
@ -96,6 +96,8 @@ fi
|
||||
if [ $FLAG_UPLOADIMAGES -gt 0 ]
|
||||
then
|
||||
PUSH_OPTION="--push"
|
||||
else
|
||||
PUSH_OPTION="--load"
|
||||
fi
|
||||
|
||||
docker buildx build -t "${REPOSITORY_NAME}${IMAGE_NAME}${ADDITIONAL_IMAGE_NAME}:${BUILD_TAG}" --platform=${ARCHS} ${PUSH_OPTION} .
|
||||
|
@ -23,4 +23,4 @@
|
||||
- devicematch: ^ttyTHS[0-9]*$
|
||||
nummaxdevices: 1
|
||||
- devicematch: ^ttyS[0-9]*$
|
||||
nummaxdevices: 1
|
||||
nummaxdevices: 1
|
||||
|
129
main.go
129
main.go
@ -5,6 +5,7 @@ package main
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"strings"
|
||||
"os"
|
||||
"regexp"
|
||||
"syscall"
|
||||
@ -18,13 +19,21 @@ import (
|
||||
|
||||
var confFileName string
|
||||
|
||||
const (
|
||||
deviceFileType uint = 0
|
||||
nvidiaSysType uint = 1
|
||||
)
|
||||
|
||||
type DeviceInstance struct {
|
||||
devicePlugin *SmarterDevicePlugin
|
||||
devicePluginSmarter *SmarterDevicePlugin
|
||||
devicePluginNvidia *NvidiaDevicePlugin
|
||||
|
||||
deviceName string
|
||||
socketName string
|
||||
deviceFile string
|
||||
numDevices uint
|
||||
deviceType uint
|
||||
deviceId string
|
||||
}
|
||||
|
||||
type DesiredDevice struct {
|
||||
@ -46,8 +55,8 @@ func init() {
|
||||
flag.Parse()
|
||||
}
|
||||
|
||||
func readDevDirectory() (files []string, err error) {
|
||||
f, err := os.Open("/dev")
|
||||
func readDevDirectory(dirToList string) (files []string, err error) {
|
||||
f, err := os.Open(dirToList)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@ -93,34 +102,65 @@ func main() {
|
||||
}
|
||||
|
||||
glog.V(0).Info("Reading existing devices on /dev")
|
||||
ExistingDevices, err := readDevDirectory()
|
||||
ExistingDevices, err := readDevDirectory("/dev")
|
||||
if err != nil {
|
||||
glog.Errorf(err.Error())
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
ExistingDevicesSys, err := readDevDirectory("/sys/devices")
|
||||
if err != nil {
|
||||
glog.Errorf(err.Error())
|
||||
os.Exit(1)
|
||||
}
|
||||
var listDevicesAvailable []DeviceInstance
|
||||
|
||||
for _, deviceToTest := range desiredDevices {
|
||||
glog.V(0).Infof("Checking devices %s on /dev",deviceToTest.DeviceMatch)
|
||||
foundDevices,err := findDevicesPattern(ExistingDevices, deviceToTest.DeviceMatch)
|
||||
if err != nil {
|
||||
glog.Errorf(err.Error())
|
||||
os.Exit(1)
|
||||
}
|
||||
if deviceToTest.DeviceMatch == "nvidia-gpu" {
|
||||
glog.V(0).Infof("Checking nvidia devices")
|
||||
foundDevices,err := findDevicesPattern(ExistingDevicesSys, "gpu.[0-9]*")
|
||||
if err != nil {
|
||||
glog.Errorf(err.Error())
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// If found some create the devices entry
|
||||
if len(foundDevices) > 0 {
|
||||
for _, deviceToCreate := range foundDevices {
|
||||
var newDevice DeviceInstance
|
||||
newDevice.deviceName = "smarter-devices/" + deviceToCreate
|
||||
newDevice.socketName = pluginapi.DevicePluginPath + "smarter-" + deviceToCreate + ".sock"
|
||||
newDevice.deviceFile = "/dev/" + deviceToCreate
|
||||
newDevice.numDevices = deviceToTest.NumMaxDevices
|
||||
listDevicesAvailable = append(listDevicesAvailable, newDevice)
|
||||
glog.V(0).Infof("Creating device %s socket and %s name for %s",newDevice.deviceName,newDevice.deviceFile,deviceToTest.DeviceMatch)
|
||||
}
|
||||
}
|
||||
// If found some create the devices entry
|
||||
if len(foundDevices) > 0 {
|
||||
for _, deviceToCreate := range foundDevices {
|
||||
var newDevice DeviceInstance
|
||||
deviceId := strings.TrimPrefix(deviceToCreate,"gpu.")
|
||||
newDevice.deviceName = "smarter-devices/" + "nvidia-gpu" + deviceId
|
||||
newDevice.deviceId = deviceId
|
||||
newDevice.socketName = pluginapi.DevicePluginPath + "smarter-nvidia-gpu" + deviceId + ".sock"
|
||||
newDevice.deviceFile = deviceId
|
||||
newDevice.numDevices = deviceToTest.NumMaxDevices
|
||||
newDevice.deviceType = nvidiaSysType
|
||||
listDevicesAvailable = append(listDevicesAvailable, newDevice)
|
||||
glog.V(0).Infof("Creating device %s socket and %s name for %s",newDevice.deviceName,newDevice.deviceFile,deviceToTest.DeviceMatch)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
glog.V(0).Infof("Checking devices %s on /dev",deviceToTest.DeviceMatch)
|
||||
foundDevices,err := findDevicesPattern(ExistingDevices, deviceToTest.DeviceMatch)
|
||||
if err != nil {
|
||||
glog.Errorf(err.Error())
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// If found some create the devices entry
|
||||
if len(foundDevices) > 0 {
|
||||
for _, deviceToCreate := range foundDevices {
|
||||
var newDevice DeviceInstance
|
||||
newDevice.deviceType = deviceFileType
|
||||
newDevice.deviceName = "smarter-devices/" + deviceToCreate
|
||||
newDevice.socketName = pluginapi.DevicePluginPath + "smarter-" + deviceToCreate + ".sock"
|
||||
newDevice.deviceFile = "/dev/" + deviceToCreate
|
||||
newDevice.numDevices = deviceToTest.NumMaxDevices
|
||||
listDevicesAvailable = append(listDevicesAvailable, newDevice)
|
||||
glog.V(0).Infof("Creating device %s socket and %s name for %s",newDevice.deviceName,newDevice.deviceFile,deviceToTest.DeviceMatch)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
glog.V(0).Info("Starting FS watcher.")
|
||||
@ -140,18 +180,34 @@ L:
|
||||
for {
|
||||
if restart {
|
||||
for _, devicesInUse := range listDevicesAvailable {
|
||||
if devicesInUse.devicePlugin != nil {
|
||||
devicesInUse.devicePlugin.Stop()
|
||||
}
|
||||
switch devicesInUse.deviceType {
|
||||
case deviceFileType :
|
||||
if devicesInUse.devicePluginSmarter != nil {
|
||||
devicesInUse.devicePluginSmarter.Stop()
|
||||
}
|
||||
case nvidiaSysType :
|
||||
if devicesInUse.devicePluginNvidia != nil {
|
||||
devicesInUse.devicePluginNvidia.Stop()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var err error
|
||||
for _, devicesInUse := range listDevicesAvailable {
|
||||
devicesInUse.devicePlugin = NewSmarterDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceFile, devicesInUse.deviceName, devicesInUse.socketName)
|
||||
if err = devicesInUse.devicePlugin.Serve(); err != nil {
|
||||
glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?")
|
||||
break
|
||||
}
|
||||
switch devicesInUse.deviceType {
|
||||
case deviceFileType :
|
||||
devicesInUse.devicePluginSmarter = NewSmarterDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceFile, devicesInUse.deviceName, devicesInUse.socketName)
|
||||
if err = devicesInUse.devicePluginSmarter.Serve(); err != nil {
|
||||
glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?")
|
||||
break
|
||||
}
|
||||
case nvidiaSysType :
|
||||
devicesInUse.devicePluginNvidia = NewNvidiaDevicePlugin(devicesInUse.numDevices, devicesInUse.deviceName,"NVIDIA_VISIBLE_DEVICES", devicesInUse.socketName, devicesInUse.deviceId)
|
||||
if err = devicesInUse.devicePluginNvidia.Serve(); err != nil {
|
||||
glog.V(0).Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?")
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
continue
|
||||
@ -178,9 +234,16 @@ L:
|
||||
default:
|
||||
glog.V(0).Infof("Received signal \"%v\", shutting down.", s)
|
||||
for _, devicesInUse := range listDevicesAvailable {
|
||||
if devicesInUse.devicePlugin != nil {
|
||||
devicesInUse.devicePlugin.Stop()
|
||||
}
|
||||
switch devicesInUse.deviceType {
|
||||
case deviceFileType :
|
||||
if devicesInUse.devicePluginSmarter != nil {
|
||||
devicesInUse.devicePluginSmarter.Stop()
|
||||
}
|
||||
case nvidiaSysType :
|
||||
if devicesInUse.devicePluginNvidia != nil {
|
||||
devicesInUse.devicePluginNvidia.Stop()
|
||||
}
|
||||
}
|
||||
}
|
||||
break L
|
||||
}
|
||||
|
276
nvidia-server.go
Normal file
276
nvidia-server.go
Normal file
@ -0,0 +1,276 @@
|
||||
// Copyright (c) 2019, Arm Ltd
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"net"
|
||||
"os"
|
||||
"path"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"golang.org/x/net/context"
|
||||
"google.golang.org/grpc"
|
||||
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
|
||||
)
|
||||
|
||||
var passDeviceSpecs = flag.Bool("pass-device-specs", false, "pass the list of DeviceSpecs to the kubelet on Allocate()")
|
||||
|
||||
// NvidiaDevicePlugin implements the Kubernetes device plugin API
|
||||
type NvidiaDevicePlugin struct {
|
||||
devs []*pluginapi.Device
|
||||
socket string
|
||||
resourceName string
|
||||
allocateEnvvar string
|
||||
id string
|
||||
|
||||
|
||||
stop chan interface{}
|
||||
health chan *pluginapi.Device
|
||||
|
||||
server *grpc.Server
|
||||
}
|
||||
|
||||
// NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin
|
||||
func NewNvidiaDevicePlugin(nDevices uint, resourceName string, allocateEnvvar string, socket string, id string) *NvidiaDevicePlugin {
|
||||
return &NvidiaDevicePlugin{
|
||||
devs: getDevices(nDevices),
|
||||
resourceName: resourceName,
|
||||
allocateEnvvar: allocateEnvvar,
|
||||
socket: socket,
|
||||
id: id,
|
||||
|
||||
stop: make(chan interface{}),
|
||||
health: make(chan *pluginapi.Device),
|
||||
}
|
||||
}
|
||||
|
||||
// dial establishes the gRPC communication with the registered device plugin.
|
||||
func dialNvidia(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) {
|
||||
c, err := grpc.Dial(unixSocketPath, grpc.WithInsecure(), grpc.WithBlock(),
|
||||
grpc.WithTimeout(timeout),
|
||||
grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
|
||||
return net.DialTimeout("unix", addr, timeout)
|
||||
}),
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return c, nil
|
||||
}
|
||||
|
||||
// Start the gRPC server of the device plugin
|
||||
func (m *NvidiaDevicePlugin) Start() error {
|
||||
glog.V(0).Info("Initializing nvidia device manager")
|
||||
err := m.cleanup()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
glog.V(0).Info("Opening nvidia device manager socket ", m.socket)
|
||||
sock, err := net.Listen("unix", m.socket)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
glog.V(0).Info("Socket opened nvidia device manager")
|
||||
|
||||
m.server = grpc.NewServer([]grpc.ServerOption{}...)
|
||||
pluginapi.RegisterDevicePluginServer(m.server, m)
|
||||
glog.V(0).Info("gRPC server registered")
|
||||
|
||||
go m.server.Serve(sock)
|
||||
glog.V(0).Info("gRPC server running on socket")
|
||||
|
||||
// Wait for server to start by launching a blocking connexion
|
||||
conn, err := dialNvidia(m.socket, 60*time.Second)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
conn.Close()
|
||||
glog.V(0).Info("gRPC Dial OK")
|
||||
|
||||
go m.healthcheck()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Stop the gRPC server
|
||||
func (m *NvidiaDevicePlugin) Stop() error {
|
||||
if m.server == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
m.server.Stop()
|
||||
m.server = nil
|
||||
close(m.stop)
|
||||
|
||||
return m.cleanup()
|
||||
}
|
||||
|
||||
// Register the device plugin for the given resourceName with Kubelet.
|
||||
func (m *NvidiaDevicePlugin) Register(kubeletEndpoint, resourceName string) error {
|
||||
conn, err := dialNvidia(kubeletEndpoint, 5*time.Second)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer conn.Close()
|
||||
|
||||
client := pluginapi.NewRegistrationClient(conn)
|
||||
reqt := &pluginapi.RegisterRequest{
|
||||
Version: pluginapi.Version,
|
||||
Endpoint: path.Base(m.socket),
|
||||
ResourceName: resourceName,
|
||||
}
|
||||
|
||||
_, err = client.Register(context.Background(), reqt)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ListAndWatch lists devices and update that list according to the health status
|
||||
func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
|
||||
s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs})
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-m.stop:
|
||||
return nil
|
||||
case d := <-m.health:
|
||||
// FIXME: there is no way to recover from the Unhealthy state.
|
||||
d.Health = pluginapi.Unhealthy
|
||||
s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *NvidiaDevicePlugin) unhealthy(dev *pluginapi.Device) {
|
||||
m.health <- dev
|
||||
}
|
||||
|
||||
// Allocate which return list of devices.
|
||||
func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
|
||||
responses := pluginapi.AllocateResponse{}
|
||||
for _, req := range reqs.ContainerRequests {
|
||||
//for _, id := range req.DevicesIDs {
|
||||
// if !m.deviceExists(id) {
|
||||
// return nil, fmt.Errorf("invalid allocation request for '%s': unknown device: %s", m.resourceName, id)
|
||||
// }
|
||||
//
|
||||
|
||||
response := pluginapi.ContainerAllocateResponse{
|
||||
Envs: map[string]string{
|
||||
m.allocateEnvvar: m.id,
|
||||
},
|
||||
}
|
||||
if *passDeviceSpecs {
|
||||
response.Devices = m.apiDeviceSpecs(req.DevicesIDs)
|
||||
}
|
||||
|
||||
responses.ContainerResponses = append(responses.ContainerResponses, &response)
|
||||
}
|
||||
|
||||
return &responses, nil
|
||||
}
|
||||
|
||||
func (m *NvidiaDevicePlugin) PreStartContainer(context.Context, *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) {
|
||||
return &pluginapi.PreStartContainerResponse{}, nil
|
||||
}
|
||||
|
||||
func (m *NvidiaDevicePlugin) cleanup() error {
|
||||
if err := os.Remove(m.socket); err != nil && !os.IsNotExist(err) {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *NvidiaDevicePlugin) healthcheck() {
|
||||
disableHealthChecks := strings.ToLower(os.Getenv(envDisableHealthChecks))
|
||||
if disableHealthChecks == "all" {
|
||||
disableHealthChecks = allHealthChecks
|
||||
}
|
||||
|
||||
_, cancel := context.WithCancel(context.Background())
|
||||
|
||||
var xids chan *pluginapi.Device
|
||||
if !strings.Contains(disableHealthChecks, "xids") {
|
||||
xids = make(chan *pluginapi.Device)
|
||||
}
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-m.stop:
|
||||
cancel()
|
||||
return
|
||||
case dev := <-xids:
|
||||
m.unhealthy(dev)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Serve starts the gRPC server and register the device plugin to Kubelet
|
||||
func (m *NvidiaDevicePlugin) Serve() error {
|
||||
err := m.Start()
|
||||
if err != nil {
|
||||
glog.Errorf("Could not start device plugin: %s", err)
|
||||
return err
|
||||
}
|
||||
glog.V(0).Info("Starting to serve on", m.socket)
|
||||
|
||||
err = m.Register(pluginapi.KubeletSocket, m.resourceName)
|
||||
if err != nil {
|
||||
glog.Errorf("Could not register device plugin: %s", err)
|
||||
m.Stop()
|
||||
return err
|
||||
}
|
||||
glog.V(0).Info("Registered device plugin with Kubelet")
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *NvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
|
||||
return &pluginapi.DevicePluginOptions{}, nil
|
||||
}
|
||||
|
||||
func (m *NvidiaDevicePlugin) apiDeviceSpecs(filter []string) []*pluginapi.DeviceSpec {
|
||||
var specs []*pluginapi.DeviceSpec
|
||||
|
||||
paths := []string{
|
||||
"/dev/nvidiactl",
|
||||
"/dev/nvidia-uvm",
|
||||
"/dev/nvidia-uvm-tools",
|
||||
"/dev/nvidia-modeset",
|
||||
}
|
||||
|
||||
for _, p := range paths {
|
||||
if _, err := os.Stat(p); err == nil {
|
||||
spec := &pluginapi.DeviceSpec{
|
||||
ContainerPath: p,
|
||||
HostPath: p,
|
||||
Permissions: "rw",
|
||||
}
|
||||
specs = append(specs, spec)
|
||||
}
|
||||
}
|
||||
|
||||
// for _, d := range m.devs {
|
||||
// for _, id := range filter {
|
||||
// if d.ID == id {
|
||||
// spec := &pluginapi.DeviceSpec{
|
||||
// ContainerPath: d.Path,
|
||||
// HostPath: d.Path,
|
||||
// Permissions: "rw",
|
||||
// }
|
||||
// specs = append(specs, spec)
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
return specs
|
||||
}
|
@ -37,7 +37,7 @@ type SmarterDevicePlugin struct {
|
||||
// NewSmarterDevicePlugin returns an initialized SmarterDevicePlugin
|
||||
func NewSmarterDevicePlugin(nDevices uint, deviceFilename string, resourceIdentification string, serverSock string) *SmarterDevicePlugin {
|
||||
return &SmarterDevicePlugin{
|
||||
devs: getDevices(uint(10)),
|
||||
devs: getDevices(nDevices),
|
||||
socket: serverSock,
|
||||
deviceFile: deviceFilename,
|
||||
resourceName: resourceIdentification,
|
||||
|
53
smarter-device-management-pod-k3s-test-xavier.yaml
Normal file
53
smarter-device-management-pod-k3s-test-xavier.yaml
Normal file
@ -0,0 +1,53 @@
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: smarter-device-management
|
||||
namespace: default
|
||||
spec:
|
||||
# Mark this pod as a critical add-on; when enabled, the critical add-on
|
||||
# scheduler reserves resources for critical add-on pods so that they can
|
||||
# be rescheduled after a failure.
|
||||
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
|
||||
priorityClassName: "system-node-critical"
|
||||
hostNetwork: true
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostname: smarter-device-management
|
||||
nodeName: smarter-jetson-xavier-4bcc2584
|
||||
containers:
|
||||
- name: smarter-device-manager
|
||||
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
|
||||
imagePullPolicy: IfNotPresent
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
resources:
|
||||
limits:
|
||||
cpu: 100m
|
||||
memory: 10Mi
|
||||
requests:
|
||||
cpu: 10m
|
||||
memory: 10Mi
|
||||
volumeMounts:
|
||||
- name: device-plugin
|
||||
mountPath: /var/lib/kubelet/device-plugins
|
||||
- name: dev-dir
|
||||
mountPath: /dev
|
||||
- name: sys-dir
|
||||
mountPath: /sys
|
||||
- name: config
|
||||
mountPath: /root/config
|
||||
volumes:
|
||||
- name: device-plugin
|
||||
hostPath:
|
||||
path: /var/lib/rancher/k3s/agent/kubelet/device-plugins
|
||||
- name: dev-dir
|
||||
hostPath:
|
||||
path: /dev
|
||||
- name: sys-dir
|
||||
hostPath:
|
||||
path: /sys
|
||||
- name: config
|
||||
configMap:
|
||||
name: smarter-device-manager-xavier
|
||||
terminationGracePeriodSeconds: 30
|
@ -15,7 +15,7 @@ spec:
|
||||
nodeName: <replace with node to run>
|
||||
containers:
|
||||
- name: smarter-device-manager
|
||||
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613
|
||||
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
|
||||
imagePullPolicy: IfNotPresent
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
@ -33,6 +33,8 @@ spec:
|
||||
mountPath: /var/lib/kubelet/device-plugins
|
||||
- name: dev-dir
|
||||
mountPath: /dev
|
||||
- name: sys-dir
|
||||
mountPath: /sys
|
||||
volumes:
|
||||
- name: device-plugin
|
||||
hostPath:
|
||||
@ -40,4 +42,7 @@ spec:
|
||||
- name: dev-dir
|
||||
hostPath:
|
||||
path: /dev
|
||||
terminationGracePeriodSeconds: 30
|
||||
- name: sys-dir
|
||||
hostPath:
|
||||
path: /sys
|
||||
terminationGracePeriodSeconds: 30
|
||||
|
@ -15,7 +15,7 @@ spec:
|
||||
nodeName: <replace with node to run>
|
||||
containers:
|
||||
- name: smarter-device-manager
|
||||
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613
|
||||
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
|
||||
imagePullPolicy: IfNotPresent
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
|
26
smarter-device-manager-configmap-xavier.yaml
Normal file
26
smarter-device-manager-configmap-xavier.yaml
Normal file
@ -0,0 +1,26 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: smarter-device-manager-xavier
|
||||
data:
|
||||
conf.yaml: |
|
||||
- devicematch: ^snd$
|
||||
nummaxdevices: 20
|
||||
- devicematch: ^gpiomem$
|
||||
nummaxdevices: 40
|
||||
- devicematch: ^gpiochip[0-9]*$
|
||||
nummaxdevices: 20
|
||||
- devicematch: ^hci[0-9]*$
|
||||
nummaxdevices: 1
|
||||
- devicematch: ^i2c-[0-9]*$
|
||||
nummaxdevices: 1
|
||||
- devicematch: ^rtc0$
|
||||
nummaxdevices: 20
|
||||
- devicematch: ^video[0-9]*$
|
||||
nummaxdevices: 20
|
||||
- devicematch: ^vchiq$
|
||||
nummaxdevices: 20
|
||||
- devicematch: ^vcsm.*$
|
||||
nummaxdevices: 20
|
||||
- devicematch: nvidia-gpu
|
||||
nummaxdevices: 20
|
48
smarter-device-manager-k3s-no-configmap.yaml
Normal file
48
smarter-device-manager-k3s-no-configmap.yaml
Normal file
@ -0,0 +1,48 @@
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: smarter-device-management
|
||||
namespace: default
|
||||
spec:
|
||||
# Mark this pod as a critical add-on; when enabled, the critical add-on
|
||||
# scheduler reserves resources for critical add-on pods so that they can
|
||||
# be rescheduled after a failure.
|
||||
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
|
||||
priorityClassName: "system-node-critical"
|
||||
hostNetwork: true
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
hostname: smarter-device-management
|
||||
nodeName: smarter-jetson-xavier-4bcc2584
|
||||
containers:
|
||||
- name: smarter-device-manager
|
||||
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
|
||||
imagePullPolicy: IfNotPresent
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
resources:
|
||||
limits:
|
||||
cpu: 100m
|
||||
memory: 10Mi
|
||||
requests:
|
||||
cpu: 10m
|
||||
memory: 10Mi
|
||||
volumeMounts:
|
||||
- name: device-plugin
|
||||
mountPath: /var/lib/kubelet/device-plugins
|
||||
- name: dev-dir
|
||||
mountPath: /dev
|
||||
- name: sys-dir
|
||||
mountPath: /sys
|
||||
volumes:
|
||||
- name: device-plugin
|
||||
hostPath:
|
||||
path: /var/lib/rancher/k3s/agent/kubelet/device-plugins
|
||||
- name: dev-dir
|
||||
hostPath:
|
||||
path: /dev
|
||||
- name: sys-dir
|
||||
hostPath:
|
||||
path: /sys
|
||||
terminationGracePeriodSeconds: 30
|
75
smarter-device-manager-k3s-with-configmap-rpi.yaml
Normal file
75
smarter-device-manager-k3s-with-configmap-rpi.yaml
Normal file
@ -0,0 +1,75 @@
|
||||
# Namespace and DaemonSet that run smarter-device-manager on every node
# labelled "smarter-device-manager: enabled" in a k3s cluster, reading its
# device configuration from the smarter-device-manager-rpi ConfigMap.
apiVersion: v1
kind: Namespace
metadata:
  name: < Replace with the namespace to use >
  labels:
    name: < Replace with the namespace to use >
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: smarter-device-manager
  namespace: < Replace with the namespace to use >
  labels:
    name: smarter-device-manager
    role: agent
spec:
  selector:
    matchLabels:
      name: smarter-device-manager
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: smarter-device-manager
      annotations:
        node.kubernetes.io/bootstrap-checkpoint: "true"
    spec:
      nodeSelector:
        smarter-device-manager : enabled
      priorityClassName: "system-node-critical"
      hostname: smarter-device-management
      hostNetwork: true
      dnsPolicy: ClusterFirstWithHostNet
      containers:
      - name: smarter-device-manager
        image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
        imagePullPolicy: IfNotPresent
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: ["ALL"]
        resources:
          limits:
            cpu: 100m
            memory: 15Mi
          requests:
            cpu: 10m
            memory: 15Mi
        volumeMounts:
          - name: device-plugin
            mountPath: /var/lib/kubelet/device-plugins
          - name: dev-dir
            mountPath: /dev
          - name: config
            mountPath: /root/config
          - name: sys-dir
            mountPath: /sys
      volumes:
        # k3s keeps the kubelet device-plugin directory under its own data dir.
        - name: device-plugin
          hostPath:
            path: /var/lib/rancher/k3s/agent/kubelet/device-plugins
        - name: dev-dir
          hostPath:
            path: /dev
        - name: sys-dir
          hostPath:
            path: /sys
        # Volume names must be unique within a pod: "config" is defined once,
        # backed by the ConfigMap that holds the device configuration.
        - name: config
          configMap:
            name: smarter-device-manager-rpi
      terminationGracePeriodSeconds: 30
|
@ -36,7 +36,7 @@ spec:
|
||||
- name: k8sedgeregcred
|
||||
containers:
|
||||
- name: smarter-device-manager
|
||||
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613
|
||||
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
|
||||
imagePullPolicy: IfNotPresent
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
@ -54,6 +54,8 @@ spec:
|
||||
mountPath: /var/lib/kubelet/device-plugins
|
||||
- name: dev-dir
|
||||
mountPath: /dev
|
||||
- name: sys-dir
|
||||
mountPath: /sys
|
||||
- name: config
|
||||
mountPath: /root/config
|
||||
volumes:
|
||||
@ -63,6 +65,9 @@ spec:
|
||||
- name: dev-dir
|
||||
hostPath:
|
||||
path: /dev
|
||||
- name: sys-dir
|
||||
hostPath:
|
||||
path: /sys
|
||||
- name: config
|
||||
configMap:
|
||||
name: smarter-device-manager-rpi
|
@ -32,11 +32,9 @@ spec:
|
||||
hostname: smarter-device-management
|
||||
hostNetwork: true
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
imagePullSecrets:
|
||||
- name: k8sedgeregcred
|
||||
containers:
|
||||
- name: smarter-device-manager
|
||||
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613
|
||||
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
|
||||
imagePullPolicy: IfNotPresent
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
@ -54,6 +52,8 @@ spec:
|
||||
mountPath: /var/lib/kubelet/device-plugins
|
||||
- name: dev-dir
|
||||
mountPath: /dev
|
||||
- name: sys-dir
|
||||
mountPath: /sys
|
||||
volumes:
|
||||
- name: device-plugin
|
||||
hostPath:
|
||||
@ -61,4 +61,7 @@ spec:
|
||||
- name: dev-dir
|
||||
hostPath:
|
||||
path: /dev
|
||||
- name: sys-dir
|
||||
hostPath:
|
||||
path: /sys
|
||||
terminationGracePeriodSeconds: 30
|
||||
|
@ -36,7 +36,7 @@ spec:
|
||||
- name: k8sedgeregcred
|
||||
containers:
|
||||
- name: smarter-device-manager
|
||||
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613
|
||||
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
|
||||
imagePullPolicy: IfNotPresent
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
@ -54,6 +54,8 @@ spec:
|
||||
mountPath: /var/lib/kubelet/device-plugins
|
||||
- name: dev-dir
|
||||
mountPath: /dev
|
||||
- name: sys-dir
|
||||
mountPath: /sys
|
||||
- name: config
|
||||
mountPath: /root/config
|
||||
volumes:
|
||||
@ -63,6 +65,9 @@ spec:
|
||||
- name: dev-dir
|
||||
hostPath:
|
||||
path: /dev
|
||||
- name: sys-dir
|
||||
hostPath:
|
||||
path: /sys
|
||||
- name: config
|
||||
configMap:
|
||||
name: smarter-device-manager-rpi
|
74
smarter-device-manager-k8s-with-configmap-xavier.yaml
Normal file
74
smarter-device-manager-k8s-with-configmap-xavier.yaml
Normal file
@ -0,0 +1,74 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: < Replace with the namespace to use >
|
||||
labels:
|
||||
name: < Replace with the namespace to use >
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: smarter-device-manager
|
||||
namespace: < Replace with the namespace to use >
|
||||
labels:
|
||||
name: smarter-device-manager
|
||||
role: agent
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
name: smarter-device-manager
|
||||
updateStrategy:
|
||||
type: RollingUpdate
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
name: smarter-device-manager
|
||||
annotations:
|
||||
node.kubernetes.io/bootstrap-checkpoint: "true"
|
||||
spec:
|
||||
nodeSelector:
|
||||
smarter-device-manager : enabled
|
||||
priorityClassName: "system-node-critical"
|
||||
hostname: smarter-device-management
|
||||
hostNetwork: true
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
imagePullSecrets:
|
||||
- name: k8sedgeregcred
|
||||
containers:
|
||||
- name: smarter-device-manager
|
||||
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
|
||||
imagePullPolicy: IfNotPresent
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
resources:
|
||||
limits:
|
||||
cpu: 100m
|
||||
memory: 15Mi
|
||||
requests:
|
||||
cpu: 10m
|
||||
memory: 15Mi
|
||||
volumeMounts:
|
||||
- name: device-plugin
|
||||
mountPath: /var/lib/kubelet/device-plugins
|
||||
- name: dev-dir
|
||||
mountPath: /dev
|
||||
- name: sys-dir
|
||||
mountPath: /sys
|
||||
- name: config
|
||||
mountPath: /root/config
|
||||
volumes:
|
||||
- name: device-plugin
|
||||
hostPath:
|
||||
path: /var/lib/kubelet/device-plugins
|
||||
- name: dev-dir
|
||||
hostPath:
|
||||
path: /dev
|
||||
- name: sys-dir
|
||||
hostPath:
|
||||
path: /sys
|
||||
- name: config
|
||||
configMap:
|
||||
name: smarter-device-manager-xavier
|
||||
terminationGracePeriodSeconds: 30
|
@ -36,7 +36,7 @@ spec:
|
||||
- name: k8sedgeregcred
|
||||
containers:
|
||||
- name: smarter-device-manager
|
||||
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:20191204204613
|
||||
image: registry.gitlab.com/arm-research/smarter/smarter-device-manager/smarter-device-manager:IMAGE_ID
|
||||
imagePullPolicy: IfNotPresent
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
@ -54,6 +54,8 @@ spec:
|
||||
mountPath: /var/lib/kubelet/device-plugins
|
||||
- name: dev-dir
|
||||
mountPath: /dev
|
||||
- name: sys-dir
|
||||
mountPath: /sys
|
||||
volumes:
|
||||
- name: device-plugin
|
||||
hostPath:
|
||||
@ -61,4 +63,7 @@ spec:
|
||||
- name: dev-dir
|
||||
hostPath:
|
||||
path: /dev
|
||||
- name: sys-dir
|
||||
hostPath:
|
||||
path: /sys
|
||||
terminationGracePeriodSeconds: 30
|
||||
|
Loading…
Reference in New Issue
Block a user