Skip to content

Commit

Permalink
Adding vfio manager
Browse files Browse the repository at this point in the history
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
  • Loading branch information
visheshtanksale committed May 3, 2024
1 parent 5281fde commit c2ab8a7
Show file tree
Hide file tree
Showing 23 changed files with 39,034 additions and 10 deletions.
3 changes: 3 additions & 0 deletions api/config/v1/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ type CommandLineFlags struct {
GDSEnabled *bool `json:"gdsEnabled" yaml:"gdsEnabled"`
MOFEDEnabled *bool `json:"mofedEnabled" yaml:"mofedEnabled"`
UseNodeFeatureAPI *bool `json:"useNodeFeatureAPI" yaml:"useNodeFeatureAPI"`
Mode *string `json:"mode" yaml:"mode"`
Plugin *PluginCommandLineFlags `json:"plugin,omitempty" yaml:"plugin,omitempty"`
GFD *GFDCommandLineFlags `json:"gfd,omitempty" yaml:"gfd,omitempty"`
}
Expand Down Expand Up @@ -128,6 +129,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) {
updateFromCLIFlag(&f.MOFEDEnabled, c, n)
case "use-node-feature-api":
updateFromCLIFlag(&f.UseNodeFeatureAPI, c, n)
case "mode":
updateFromCLIFlag(&f.Mode, c, n)
}
// Plugin specific flags
if f.Plugin == nil {
Expand Down
5 changes: 4 additions & 1 deletion api/config/v1/flags_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,8 @@ func TestMarshalFlags(t *testing.T) {
"failOnInitError": null,
"gdsEnabled": null,
"mofedEnabled": null,
"useNodeFeatureAPI": null
"useNodeFeatureAPI": null,
"mode": null
}`,
},
{
Expand All @@ -179,6 +180,7 @@ func TestMarshalFlags(t *testing.T) {
"gdsEnabled": null,
"mofedEnabled": null,
"useNodeFeatureAPI": null,
"mode": null,
"gfd": {
"oneshot": null,
"noTimestamp": null,
Expand All @@ -202,6 +204,7 @@ func TestMarshalFlags(t *testing.T) {
"gdsEnabled": null,
"mofedEnabled": null,
"useNodeFeatureAPI": null,
"mode": null,
"gfd": {
"oneshot": null,
"noTimestamp": null,
Expand Down
11 changes: 11 additions & 0 deletions cmd/gpu-feature-discovery/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"fmt"
"os"
"path/filepath"
"slices"
"syscall"
"time"

Expand Down Expand Up @@ -99,6 +100,12 @@ func main() {
Usage: "Use NFD NodeFeature API to publish labels",
EnvVars: []string{"GFD_USE_NODE_FEATURE_API", "USE_NODE_FEATURE_API"},
},
&cli.StringFlag{
Name: "mode",
Value: "auto",
Usage: "Select GFD mode between 'auto','nvml','tegra' or 'vfio'",
EnvVars: []string{"MODE", "GFD_MODE"},
},
}

config.flags = append(config.flags, config.kubeClientConfig.Flags()...)
Expand All @@ -113,6 +120,10 @@ func main() {
}

// validateFlags checks that the flags stored in config are usable.
//
// It returns an error when config or its mode flag is unset, or when the mode
// is not one of 'auto', 'nvml', 'tegra' or 'vfio'. It returns nil otherwise.
func validateFlags(config *spec.Config) error {
	// The mode flag is a pointer that is dereferenced below; report an unset
	// value as a validation error instead of panicking on a nil pointer.
	if config == nil || config.Flags.Mode == nil {
		return fmt.Errorf("mode is not set; must be 'auto','nvml','tegra' or 'vfio'")
	}

	validModes := []string{"auto", "nvml", "tegra", "vfio"}
	if mode := *config.Flags.Mode; !slices.Contains(validModes, mode) {
		return fmt.Errorf("%s invalid mode option must be 'auto','nvml','tegra' or 'vfio'", mode)
	}
	return nil
}

Expand Down
1 change: 1 addition & 0 deletions cmd/gpu-feature-discovery/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ func TestRunSleep(t *testing.T) {
CommandLineFlags: spec.CommandLineFlags{
MigStrategy: ptr("none"),
FailOnInitError: ptr(true),
Mode: ptr("auto"),
GFD: &spec.GFDCommandLineFlags{
Oneshot: ptr(false),
OutputFile: ptr("./gfd-test-loop"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,8 @@ spec:
- name: NVIDIA_MIG_MONITOR_DEVICES
value: all
{{- end }}
- name: GFD_MODE
value: "{{ .Values.gfdMode }}"
securityContext:
{{- include "gpu-feature-discovery.securityContext" . | nindent 10 }}
volumeMounts:
Expand Down
1 change: 1 addition & 0 deletions deployments/helm/nvidia-device-plugin/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ deviceIDStrategy: null
nvidiaDriverRoot: null
gdsEnabled: null
mofedEnabled: null
gfdMode: "auto"

nameOverride: ""
fullnameOverride: ""
Expand Down
39 changes: 30 additions & 9 deletions internal/resource/factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import (

// NewManager is a factory method that creates a resource Manager based on the specified config.
func NewManager(config *spec.Config) Manager {
return WithConfig(getManager(), config)
return WithConfig(getManager(*config.Flags.Mode), config)
}

// WithConfig modifies a manager depending on the specified config.
Expand All @@ -39,7 +39,30 @@ func WithConfig(manager Manager, config *spec.Config) Manager {
}

// getManager returns the resource manager for the specified mode; the mode is first resolved against the system configuration.
func getManager() Manager {
// getManager selects the resource manager matching mode.
//
// The mode is first passed through resolveMode and the resolved value picks
// the manager: "nvml" -> NVML manager, "tegra" -> CUDA manager, "vfio" -> VFIO
// manager. Any other value yields the null (empty) manager and a warning.
func getManager(mode string) Manager {
	resolved := resolveMode(mode)
	switch resolved {
	case "nvml":
		klog.Info("Using NVML manager")
		return NewNVMLManager()
	case "tegra":
		klog.Info("Using CUDA manager")
		return NewCudaManager()
	case "vfio":
		klog.Info("Using Vfio manager")
		return NewVfioManager()
	case "", "auto":
		// resolveMode hands back the unresolved input when auto-detection
		// finds neither NVML nor Tegra. That is "nothing detected", not an
		// unsupported mode, so log it as such.
		klog.Warning("No valid resources detected; using empty manager.")
		return NewNullManager()
	}

	klog.Warningf("Unsupported mode detected: %v using empty manager.", resolved)
	return NewNullManager()
}

func resolveMode(mode string) string {
if mode != "" && mode != "auto" {
return mode
}

// logWithReason logs the output of the has* / is* checks from the info.Interface
logWithReason := func(f func() (bool, string), tag string) bool {
is, reason := f()
Expand All @@ -62,13 +85,11 @@ func getManager() Manager {
}

if hasNVML {
klog.Info("Using NVML manager")
return NewNVMLManager()
} else if isTegra {
klog.Info("Using CUDA manager")
return NewCudaManager()
return "nvml"
}

klog.Warning("No valid resources detected; using empty manager.")
return NewNullManager()
if isTegra {
return "tegra"
}
return mode
}
66 changes: 66 additions & 0 deletions internal/resource/sysfs-device.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/**
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

package resource

import (
"fmt"

"github.com/NVIDIA/go-nvlib/pkg/nvpci"
)

// vfioDevice wraps an NVIDIA PCI device as reported by the nvpci library.
type vfioDevice struct {
	// nvidiaPCIDevice is the underlying PCI device the methods read from.
	nvidiaPCIDevice *nvpci.NvidiaPCIDevice
}

// GetMigDevices always returns an empty (nil) list and a nil error: a
// vfioDevice never reports any MIG devices.
func (d vfioDevice) GetMigDevices() ([]Device, error) {
	var migDevices []Device
	return migDevices, nil
}

// GetCudaComputeCapability is not supported for GPU devices with vfio pci
// driver. It returns -1 for both the major and minor values and a nil error.
func (d vfioDevice) GetCudaComputeCapability() (int, int, error) {
	const unsupported = -1
	return unsupported, unsupported, nil
}

// GetAttributes is only supported for MIG devices; for a vfioDevice it always
// returns a nil map together with an error.
func (d vfioDevice) GetAttributes() (map[string]interface{}, error) {
	err := fmt.Errorf("GetAttributes is not supported for non-MIG devices")
	return nil, err
}

// GetDeviceHandleFromMigDeviceHandle is only supported for MIG devices; for a
// vfioDevice it always returns a nil Device together with an error.
func (d vfioDevice) GetDeviceHandleFromMigDeviceHandle() (Device, error) {
	err := fmt.Errorf("GetDeviceHandleFromMigDeviceHandle is not supported for non-MIG devices")
	return nil, err
}

// GetName returns the DeviceName recorded on the underlying PCI device. The
// returned error is always nil.
func (d vfioDevice) GetName() (string, error) {
	pciDev := d.nvidiaPCIDevice
	return pciDev.DeviceName, nil
}

// GetTotalMemoryMB reports the device memory as the second value returned by
// the PCI resources' GetTotalAddressableMemory(true). The returned error is
// always nil.
//
// NOTE(review): the method name promises MB, but the value is passed through
// unconverted — confirm the unit GetTotalAddressableMemory reports.
func (d vfioDevice) GetTotalMemoryMB() (uint64, error) {
	pciDev := d.nvidiaPCIDevice
	_, total := pciDev.Resources.GetTotalAddressableMemory(true)
	return total, nil
}

// IsMigEnabled always reports false with a nil error for a vfioDevice.
func (d vfioDevice) IsMigEnabled() (bool, error) {
	const migEnabled = false
	return migEnabled, nil
}

// IsMigCapable always reports false with a nil error for a vfioDevice.
func (d vfioDevice) IsMigCapable() (bool, error) {
	const migCapable = false
	return migCapable, nil
}
75 changes: 75 additions & 0 deletions internal/resource/sysfs-lib.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/**
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

package resource

import (
"github.com/NVIDIA/go-nvlib/pkg/nvpci"
"k8s.io/klog/v2"
)

// vfioLib is the resource manager for devices bound to the VFIO PCI driver.
type vfioLib struct {
	// nvpcilib is the nvpci interface used to enumerate GPUs.
	nvpcilib nvpci.Interface
}

// NewVfioManager returns a resource manager for devices with the VFIO PCI
// driver, backed by a freshly created nvpci library handle.
func NewVfioManager() Manager {
	return &vfioLib{
		nvpcilib: nvpci.New(),
	}
}

// Init performs no work for the vfio manager and always returns nil.
func (l *vfioLib) Init() error {
	var err error
	return err
}

// Shutdown performs no work for the vfio manager and always returns nil.
func (l *vfioLib) Shutdown() (err error) {
	// err is never assigned, so this returns its zero value (nil).
	return err
}

// GetDevices lists the GPUs reported by the nvpci library and returns those
// whose driver is "vfio-pci". GPUs with any other driver are logged and
// skipped. An error from the GPU enumeration is returned unchanged with a nil
// device list.
func (l *vfioLib) GetDevices() ([]Device, error) {
	gpus, err := l.nvpcilib.GetGPUs()
	if err != nil {
		return nil, err
	}

	var vfioDevices []Device
	for _, gpu := range gpus {
		if gpu.Driver != "vfio-pci" {
			klog.Infof("Device not bound to 'vfio-pci'; device: %s driver: '%s'", gpu.Address, gpu.Driver)
			continue
		}
		vfioDevices = append(vfioDevices, vfioDevice{nvidiaPCIDevice: gpu})
	}
	return vfioDevices, nil
}

// GetCudaDriverVersion is not supported by the vfio manager. It returns
// pointers to a zero major and a zero minor version and a nil error.
func (l *vfioLib) GetCudaDriverVersion() (*uint, *uint, error) {
	// Use two separate variables so the returned pointers do not alias each
	// other; a caller writing through one must not change the other.
	major, minor := uint(0), uint(0)
	return &major, &minor, nil
}

// GetDriverVersion is not supported by the vfio manager; it always returns
// the placeholder version string and a nil error.
func (l *vfioLib) GetDriverVersion() (string, error) {
	const unknownVersion = "unknown.unknown.unknown"
	return unknownVersion, nil
}
94 changes: 94 additions & 0 deletions vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/bytes/bytes.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit c2ab8a7

Please sign in to comment.