cmd/run, pkg/nvidia: Detect mismatched NVIDIA kernel & user space driver

The proprietary NVIDIA driver has a kernel space part and a user space
part, and both must always have the same version.  Sometimes,
the host operating system might end up with mismatched parts.  One
reason could be that the different third-party repositories used to
distribute the driver might be incompatible with each other.  E.g., in
the case of Fedora, it could be RPM Fusion and NVIDIA's own repository.

This shows up in the systemd journal as:
  $ journalctl --dmesg
  ...
  kernel: NVRM: API mismatch: the client has the version 555.58.02, but
          NVRM: this kernel module has the version 560.35.03.  Please
          NVRM: make sure that this kernel module and all NVIDIA driver
          NVRM: components have the same version.
  ...

Without any special handling of this scenario, users would be presented
with a very misleading error:
  $ toolbox enter
  Error: failed to get Container Device Interface containerEdits for
      NVIDIA

Instead, improve the error message to be more self-documenting:
  $ toolbox enter
  Error: the proprietary NVIDIA driver's kernel and user space don't
      match
  Check the host operating system and systemd journal.

https://github.com/containers/toolbox/pull/1541
This commit is contained in:
Debarshi Ray 2024-08-30 18:24:16 +02:00
parent 977c3d98a4
commit 8dd2f8e80a
3 changed files with 27 additions and 6 deletions

View File

@ -269,7 +269,14 @@ func runCommand(container string,
cdiSpecForNvidia, err := nvidia.GenerateCDISpec() cdiSpecForNvidia, err := nvidia.GenerateCDISpec()
if err != nil { if err != nil {
if !errors.Is(err, nvidia.ErrPlatformUnsupported) { if errors.Is(err, nvidia.ErrNVMLDriverLibraryVersionMismatch) {
var builder strings.Builder
fmt.Fprintf(&builder, "the proprietary NVIDIA driver's kernel and user space don't match\n")
fmt.Fprintf(&builder, "Check the host operating system and systemd journal.")
errMsg := builder.String()
return errors.New(errMsg)
} else if !errors.Is(err, nvidia.ErrPlatformUnsupported) {
return err return err
} }
} else { } else {

View File

@ -5,6 +5,7 @@ go 1.20
require ( require (
github.com/HarryMichal/go-version v1.0.1 github.com/HarryMichal/go-version v1.0.1
github.com/NVIDIA/go-nvlib v0.6.1 github.com/NVIDIA/go-nvlib v0.6.1
github.com/NVIDIA/go-nvml v0.12.4-0
github.com/NVIDIA/nvidia-container-toolkit v1.16.1 github.com/NVIDIA/nvidia-container-toolkit v1.16.1
github.com/acobaugh/osrelease v0.1.0 github.com/acobaugh/osrelease v0.1.0
github.com/briandowns/spinner v1.18.0 github.com/briandowns/spinner v1.18.0
@ -23,7 +24,6 @@ require (
) )
require ( require (
github.com/NVIDIA/go-nvml v0.12.4-0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect github.com/davecgh/go-spew v1.1.1 // indirect
github.com/fatih/color v1.13.0 // indirect github.com/fatih/color v1.13.0 // indirect
github.com/google/uuid v1.6.0 // indirect github.com/google/uuid v1.6.0 // indirect

View File

@ -21,6 +21,7 @@ import (
"io" "io"
"github.com/NVIDIA/go-nvlib/pkg/nvlib/info" "github.com/NVIDIA/go-nvlib/pkg/nvlib/info"
"github.com/NVIDIA/go-nvml/pkg/nvml"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi" "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
nvspec "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec" nvspec "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
@ -32,7 +33,8 @@ var (
) )
var ( var (
ErrPlatformUnsupported = errors.New("platform is unsupported") ErrNVMLDriverLibraryVersionMismatch = errors.New("NVML driver/library version mismatch")
ErrPlatformUnsupported = errors.New("platform is unsupported")
) )
func createNullLogger() *logrus.Logger { func createNullLogger() *logrus.Logger {
@ -52,7 +54,8 @@ func GenerateCDISpec() (*specs.Spec, error) {
logger = logrus.StandardLogger() logger = logrus.StandardLogger()
} }
info := info.New(info.WithLogger(logger)) nvmLib := nvml.New()
info := info.New(info.WithLogger(logger), info.WithNvmlLib(nvmLib))
if ok, reason := info.HasDXCore(); ok { if ok, reason := info.HasDXCore(); ok {
logrus.Debugf("Generating Container Device Interface for NVIDIA: Windows is unsupported: %s", reason) logrus.Debugf("Generating Container Device Interface for NVIDIA: Windows is unsupported: %s", reason)
@ -60,7 +63,18 @@ func GenerateCDISpec() (*specs.Spec, error) {
} }
hasNvml, reason := info.HasNvml() hasNvml, reason := info.HasNvml()
if !hasNvml { if hasNvml {
if err := nvmLib.Init(); err != nvml.SUCCESS {
logrus.Debugf("Generating Container Device Interface for NVIDIA: failed to initialize NVML: %s",
err)
if err == nvml.ERROR_LIB_RM_VERSION_MISMATCH {
return nil, ErrNVMLDriverLibraryVersionMismatch
} else {
return nil, errors.New("failed to initialize NVIDIA Management Library")
}
}
} else {
logrus.Debugf("Generating Container Device Interface for NVIDIA: Management Library not found: %s", logrus.Debugf("Generating Container Device Interface for NVIDIA: Management Library not found: %s",
reason) reason)
} }
@ -75,7 +89,7 @@ func GenerateCDISpec() (*specs.Spec, error) {
return nil, ErrPlatformUnsupported return nil, ErrPlatformUnsupported
} }
cdi, err := nvcdi.New(nvcdi.WithInfoLib(info), nvcdi.WithLogger(logger)) cdi, err := nvcdi.New(nvcdi.WithInfoLib(info), nvcdi.WithLogger(logger), nvcdi.WithNvmlLib(nvmLib))
if err != nil { if err != nil {
logrus.Debugf("Generating Container Device Interface for NVIDIA: failed to create library: %s", err) logrus.Debugf("Generating Container Device Interface for NVIDIA: failed to create library: %s", err)
return nil, errors.New("failed to create Container Device Interface library for NVIDIA") return nil, errors.New("failed to create Container Device Interface library for NVIDIA")