cmd/run, pkg/nvidia: Detect mismatched NVIDIA kernel & user space driver
The proprietary NVIDIA driver has a kernel space part and a user space
part, and the two must always have exactly the same version. Sometimes,
the host operating system might end up with mismatched parts. One
reason could be that the different third-party repositories used to
distribute the driver might be incompatible with each other. For example, in
the case of Fedora it could be RPM Fusion and NVIDIA's own repository.
This shows up in the systemd journal as:
$ journalctl --dmesg
...
kernel: NVRM: API mismatch: the client has the version 555.58.02, but
NVRM: this kernel module has the version 560.35.03. Please
NVRM: make sure that this kernel module and all NVIDIA driver
NVRM: components have the same version.
...
Without any special handling of this scenario, users would be presented
with a very misleading error:
$ toolbox enter
Error: failed to get Container Device Interface containerEdits for
NVIDIA
Instead, improve the error message to be more self-documenting:
$ toolbox enter
Error: the proprietary NVIDIA driver's kernel and user space don't
match
Check the host operating system and systemd journal.
https://github.com/containers/toolbox/pull/1541
This commit is contained in:
parent
977c3d98a4
commit
8dd2f8e80a
|
|
@ -269,7 +269,14 @@ func runCommand(container string,
|
||||||
|
|
||||||
cdiSpecForNvidia, err := nvidia.GenerateCDISpec()
|
cdiSpecForNvidia, err := nvidia.GenerateCDISpec()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if !errors.Is(err, nvidia.ErrPlatformUnsupported) {
|
if errors.Is(err, nvidia.ErrNVMLDriverLibraryVersionMismatch) {
|
||||||
|
var builder strings.Builder
|
||||||
|
fmt.Fprintf(&builder, "the proprietary NVIDIA driver's kernel and user space don't match\n")
|
||||||
|
fmt.Fprintf(&builder, "Check the host operating system and systemd journal.")
|
||||||
|
|
||||||
|
errMsg := builder.String()
|
||||||
|
return errors.New(errMsg)
|
||||||
|
} else if !errors.Is(err, nvidia.ErrPlatformUnsupported) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@ go 1.20
|
||||||
require (
|
require (
|
||||||
github.com/HarryMichal/go-version v1.0.1
|
github.com/HarryMichal/go-version v1.0.1
|
||||||
github.com/NVIDIA/go-nvlib v0.6.1
|
github.com/NVIDIA/go-nvlib v0.6.1
|
||||||
|
github.com/NVIDIA/go-nvml v0.12.4-0
|
||||||
github.com/NVIDIA/nvidia-container-toolkit v1.16.1
|
github.com/NVIDIA/nvidia-container-toolkit v1.16.1
|
||||||
github.com/acobaugh/osrelease v0.1.0
|
github.com/acobaugh/osrelease v0.1.0
|
||||||
github.com/briandowns/spinner v1.18.0
|
github.com/briandowns/spinner v1.18.0
|
||||||
|
|
@ -23,7 +24,6 @@ require (
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/NVIDIA/go-nvml v0.12.4-0 // indirect
|
|
||||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||||
github.com/fatih/color v1.13.0 // indirect
|
github.com/fatih/color v1.13.0 // indirect
|
||||||
github.com/google/uuid v1.6.0 // indirect
|
github.com/google/uuid v1.6.0 // indirect
|
||||||
|
|
|
||||||
|
|
@ -21,6 +21,7 @@ import (
|
||||||
"io"
|
"io"
|
||||||
|
|
||||||
"github.com/NVIDIA/go-nvlib/pkg/nvlib/info"
|
"github.com/NVIDIA/go-nvlib/pkg/nvlib/info"
|
||||||
|
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
|
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
|
||||||
nvspec "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
|
nvspec "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
|
||||||
"github.com/sirupsen/logrus"
|
"github.com/sirupsen/logrus"
|
||||||
|
|
@ -32,7 +33,8 @@ var (
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
ErrPlatformUnsupported = errors.New("platform is unsupported")
|
ErrNVMLDriverLibraryVersionMismatch = errors.New("NVML driver/library version mismatch")
|
||||||
|
ErrPlatformUnsupported = errors.New("platform is unsupported")
|
||||||
)
|
)
|
||||||
|
|
||||||
func createNullLogger() *logrus.Logger {
|
func createNullLogger() *logrus.Logger {
|
||||||
|
|
@ -52,7 +54,8 @@ func GenerateCDISpec() (*specs.Spec, error) {
|
||||||
logger = logrus.StandardLogger()
|
logger = logrus.StandardLogger()
|
||||||
}
|
}
|
||||||
|
|
||||||
info := info.New(info.WithLogger(logger))
|
nvmLib := nvml.New()
|
||||||
|
info := info.New(info.WithLogger(logger), info.WithNvmlLib(nvmLib))
|
||||||
|
|
||||||
if ok, reason := info.HasDXCore(); ok {
|
if ok, reason := info.HasDXCore(); ok {
|
||||||
logrus.Debugf("Generating Container Device Interface for NVIDIA: Windows is unsupported: %s", reason)
|
logrus.Debugf("Generating Container Device Interface for NVIDIA: Windows is unsupported: %s", reason)
|
||||||
|
|
@ -60,7 +63,18 @@ func GenerateCDISpec() (*specs.Spec, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
hasNvml, reason := info.HasNvml()
|
hasNvml, reason := info.HasNvml()
|
||||||
if !hasNvml {
|
if hasNvml {
|
||||||
|
if err := nvmLib.Init(); err != nvml.SUCCESS {
|
||||||
|
logrus.Debugf("Generating Container Device Interface for NVIDIA: failed to initialize NVML: %s",
|
||||||
|
err)
|
||||||
|
|
||||||
|
if err == nvml.ERROR_LIB_RM_VERSION_MISMATCH {
|
||||||
|
return nil, ErrNVMLDriverLibraryVersionMismatch
|
||||||
|
} else {
|
||||||
|
return nil, errors.New("failed to initialize NVIDIA Management Library")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
logrus.Debugf("Generating Container Device Interface for NVIDIA: Management Library not found: %s",
|
logrus.Debugf("Generating Container Device Interface for NVIDIA: Management Library not found: %s",
|
||||||
reason)
|
reason)
|
||||||
}
|
}
|
||||||
|
|
@ -75,7 +89,7 @@ func GenerateCDISpec() (*specs.Spec, error) {
|
||||||
return nil, ErrPlatformUnsupported
|
return nil, ErrPlatformUnsupported
|
||||||
}
|
}
|
||||||
|
|
||||||
cdi, err := nvcdi.New(nvcdi.WithInfoLib(info), nvcdi.WithLogger(logger))
|
cdi, err := nvcdi.New(nvcdi.WithInfoLib(info), nvcdi.WithLogger(logger), nvcdi.WithNvmlLib(nvmLib))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logrus.Debugf("Generating Container Device Interface for NVIDIA: failed to create library: %s", err)
|
logrus.Debugf("Generating Container Device Interface for NVIDIA: failed to create library: %s", err)
|
||||||
return nil, errors.New("failed to create Container Device Interface library for NVIDIA")
|
return nil, errors.New("failed to create Container Device Interface library for NVIDIA")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue