diff --git a/src/cmd/run.go b/src/cmd/run.go index 39ac8f0..719c0d6 100644 --- a/src/cmd/run.go +++ b/src/cmd/run.go @@ -269,7 +269,14 @@ func runCommand(container string, cdiSpecForNvidia, err := nvidia.GenerateCDISpec() if err != nil { - if !errors.Is(err, nvidia.ErrPlatformUnsupported) { + if errors.Is(err, nvidia.ErrNVMLDriverLibraryVersionMismatch) { + var builder strings.Builder + fmt.Fprintf(&builder, "the proprietary NVIDIA driver's kernel and user space don't match\n") + fmt.Fprintf(&builder, "Check the host operating system and systemd journal.") + + errMsg := builder.String() + return errors.New(errMsg) + } else if !errors.Is(err, nvidia.ErrPlatformUnsupported) { return err } } else { diff --git a/src/go.mod b/src/go.mod index 36e8d50..d6c6055 100644 --- a/src/go.mod +++ b/src/go.mod @@ -5,6 +5,7 @@ go 1.20 require ( github.com/HarryMichal/go-version v1.0.1 github.com/NVIDIA/go-nvlib v0.6.1 + github.com/NVIDIA/go-nvml v0.12.4-0 github.com/NVIDIA/nvidia-container-toolkit v1.16.1 github.com/acobaugh/osrelease v0.1.0 github.com/briandowns/spinner v1.18.0 @@ -23,7 +24,6 @@ require ( ) require ( - github.com/NVIDIA/go-nvml v0.12.4-0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/fatih/color v1.13.0 // indirect github.com/google/uuid v1.6.0 // indirect diff --git a/src/pkg/nvidia/nvidia.go b/src/pkg/nvidia/nvidia.go index 16daf2f..fdb9240 100644 --- a/src/pkg/nvidia/nvidia.go +++ b/src/pkg/nvidia/nvidia.go @@ -21,6 +21,7 @@ import ( "io" "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" + "github.com/NVIDIA/go-nvml/pkg/nvml" "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi" nvspec "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec" "github.com/sirupsen/logrus" @@ -32,7 +33,8 @@ var ( ) var ( - ErrPlatformUnsupported = errors.New("platform is unsupported") + ErrNVMLDriverLibraryVersionMismatch = errors.New("NVML driver/library version mismatch") + ErrPlatformUnsupported = errors.New("platform is unsupported") ) func createNullLogger() *logrus.Logger { @@ -52,7 +54,8 @@ func GenerateCDISpec() (*specs.Spec, error) { logger = logrus.StandardLogger() } - info := info.New(info.WithLogger(logger)) + nvmLib := nvml.New() + info := info.New(info.WithLogger(logger), info.WithNvmlLib(nvmLib)) if ok, reason := info.HasDXCore(); ok { logrus.Debugf("Generating Container Device Interface for NVIDIA: Windows is unsupported: %s", reason) @@ -60,7 +63,18 @@ func GenerateCDISpec() (*specs.Spec, error) { } hasNvml, reason := info.HasNvml() - if !hasNvml { + if hasNvml { + if err := nvmLib.Init(); err != nvml.SUCCESS { + logrus.Debugf("Generating Container Device Interface for NVIDIA: failed to initialize NVML: %s", + err) + + if err == nvml.ERROR_LIB_RM_VERSION_MISMATCH { + return nil, ErrNVMLDriverLibraryVersionMismatch + } else { + return nil, errors.New("failed to initialize NVIDIA Management Library") + } + } + } else { logrus.Debugf("Generating Container Device Interface for NVIDIA: Management Library not found: %s", reason) } @@ -75,7 +89,7 @@ func GenerateCDISpec() (*specs.Spec, error) { return nil, ErrPlatformUnsupported } - cdi, err := nvcdi.New(nvcdi.WithInfoLib(info), nvcdi.WithLogger(logger)) + cdi, err := nvcdi.New(nvcdi.WithInfoLib(info), nvcdi.WithLogger(logger), nvcdi.WithNvmlLib(nvmLib)) if err != nil { logrus.Debugf("Generating Container Device Interface for NVIDIA: failed to create library: %s", err) return nil, errors.New("failed to create Container Device Interface library for NVIDIA")