From 3ebe8ea500958b0cc50dba40611dce9d805c4de3 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Fri, 6 Dec 2024 13:54:53 +0100 Subject: [PATCH 1/4] [no-relnote] Fix error message Signed-off-by: Evan Lezar --- pkg/nvcdi/lib-nvml.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/nvcdi/lib-nvml.go b/pkg/nvcdi/lib-nvml.go index 01c22ff37..c940b090d 100644 --- a/pkg/nvcdi/lib-nvml.go +++ b/pkg/nvcdi/lib-nvml.go @@ -37,7 +37,7 @@ var _ Interface = (*nvmllib)(nil) // GetSpec should not be called for nvmllib func (l *nvmllib) GetSpec() (spec.Interface, error) { - return nil, fmt.Errorf("Unexpected call to nvmllib.GetSpec()") + return nil, fmt.Errorf("unexpected call to nvmllib.GetSpec()") } // GetAllDeviceSpecs returns the device specs for all available devices. From 1172f2786c92cbe4cd1eb2c40f2331d4636b78a8 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Fri, 6 Dec 2024 13:54:35 +0100 Subject: [PATCH 2/4] [no-relnote] Move nvcdi wrapper to separate file Signed-off-by: Evan Lezar --- pkg/nvcdi/lib.go | 44 --------------------------- pkg/nvcdi/wrapper.go | 72 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 44 deletions(-) create mode 100644 pkg/nvcdi/wrapper.go diff --git a/pkg/nvcdi/lib.go b/pkg/nvcdi/lib.go index 8ed9e5aa0..efabded4b 100644 --- a/pkg/nvcdi/lib.go +++ b/pkg/nvcdi/lib.go @@ -22,26 +22,14 @@ import ( "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" "github.com/NVIDIA/go-nvml/pkg/nvml" - "tags.cncf.io/container-device-interface/pkg/cdi" - "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root" "github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils" "github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/tegra/csv" - "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec" "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/transform" ) -type wrapper struct { - Interface - - vendor string - class string - - mergedDeviceOptions []transform.MergedDeviceOption -} - type nvcdilib struct { logger logger.Interface nvmllib nvml.Interface @@ -174,38 +162,6 @@ func New(opts ...Option) (Interface, error) { return &w, nil } -// GetSpec combines the device specs and common edits from the wrapped Interface to a single spec.Interface. -func (l *wrapper) GetSpec() (spec.Interface, error) { - deviceSpecs, err := l.GetAllDeviceSpecs() - if err != nil { - return nil, err - } - - edits, err := l.GetCommonEdits() - if err != nil { - return nil, err - } - - return spec.New( - spec.WithDeviceSpecs(deviceSpecs), - spec.WithEdits(*edits.ContainerEdits), - spec.WithVendor(l.vendor), - spec.WithClass(l.class), - spec.WithMergedDeviceOptions(l.mergedDeviceOptions...), - ) -} - -// GetCommonEdits returns the wrapped edits and adds additional edits on top. -func (m *wrapper) GetCommonEdits() (*cdi.ContainerEdits, error) { - edits, err := m.Interface.GetCommonEdits() - if err != nil { - return nil, err - } - edits.Env = append(edits.Env, image.EnvVarNvidiaVisibleDevices+"=void") - - return edits, nil -} - // resolveMode resolves the mode for CDI spec generation based on the current system. func (l *nvcdilib) resolveMode() (rmode string) { if l.mode != ModeAuto { diff --git a/pkg/nvcdi/wrapper.go b/pkg/nvcdi/wrapper.go new file mode 100644 index 000000000..368f7dc95 --- /dev/null +++ b/pkg/nvcdi/wrapper.go @@ -0,0 +1,72 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package nvcdi + +import ( + "tags.cncf.io/container-device-interface/pkg/cdi" + "tags.cncf.io/container-device-interface/specs-go" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" + "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec" + "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/transform" +) + +type wrapper struct { + Interface + + vendor string + class string + + mergedDeviceOptions []transform.MergedDeviceOption +} + +// GetSpec combines the device specs and common edits from the wrapped Interface to a single spec.Interface. +func (l *wrapper) GetSpec() (spec.Interface, error) { + deviceSpecs, err := l.GetAllDeviceSpecs() + if err != nil { + return nil, err + } + + edits, err := l.GetCommonEdits() + if err != nil { + return nil, err + } + + return spec.New( + spec.WithDeviceSpecs(deviceSpecs), + spec.WithEdits(*edits.ContainerEdits), + spec.WithVendor(l.vendor), + spec.WithClass(l.class), + spec.WithMergedDeviceOptions(l.mergedDeviceOptions...), + ) +} + +// GetAllDeviceSpecs returns the device specs for all available devices. +func (l *wrapper) GetAllDeviceSpecs() ([]specs.Device, error) { + return l.Interface.GetAllDeviceSpecs() +} + +// GetCommonEdits returns the wrapped edits and adds additional edits on top. +func (m *wrapper) GetCommonEdits() (*cdi.ContainerEdits, error) { + edits, err := m.Interface.GetCommonEdits() + if err != nil { + return nil, err + } + edits.Env = append(edits.Env, image.EnvVarNvidiaVisibleDevices+"=void") + + return edits, nil +} From 63f4fe1de1e92fcef14b629eb1b0da72398fd712 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Fri, 6 Dec 2024 10:17:46 +0100 Subject: [PATCH 3/4] [no-relnote] Refactor CDI version extraction Signed-off-by: Evan Lezar --- pkg/nvcdi/common-nvml.go | 4 ++-- pkg/nvcdi/driver-nvml.go | 17 +---------------- pkg/nvcdi/lib-nvml.go | 20 +++++++++++++++++++- pkg/nvcdi/lib.go | 35 ++++++++++++++++++++++++++++------- pkg/nvcdi/management.go | 24 +----------------------- 5 files changed, 51 insertions(+), 49 deletions(-) diff --git a/pkg/nvcdi/common-nvml.go b/pkg/nvcdi/common-nvml.go index 4dd1bc357..1acb309aa 100644 --- a/pkg/nvcdi/common-nvml.go +++ b/pkg/nvcdi/common-nvml.go @@ -24,7 +24,7 @@ import ( // newCommonNVMLDiscoverer returns a discoverer for entities that are not associated with a specific CDI device. // This includes driver libraries and meta devices, for example. -func (l *nvmllib) newCommonNVMLDiscoverer() (discover.Discover, error) { +func (l *nvmllib) newCommonNVMLDiscoverer(version string) (discover.Discover, error) { metaDevices := discover.NewCharDeviceDiscoverer( l.logger, l.devRoot, @@ -41,7 +41,7 @@ func (l *nvmllib) newCommonNVMLDiscoverer() (discover.Discover, error) { l.logger.Warningf("failed to create discoverer for graphics mounts: %v", err) } - driverFiles, err := NewDriverDiscoverer(l.logger, l.driver, l.nvidiaCDIHookPath, l.ldconfigPath, l.nvmllib) + driverFiles, err := NewDriverDiscoverer(l.logger, l.driver, l.nvidiaCDIHookPath, l.ldconfigPath, version) if err != nil { return nil, fmt.Errorf("failed to create discoverer for driver files: %v", err) } diff --git a/pkg/nvcdi/driver-nvml.go b/pkg/nvcdi/driver-nvml.go index 007ead28b..519c6c36f 100644 --- a/pkg/nvcdi/driver-nvml.go +++ b/pkg/nvcdi/driver-nvml.go @@ -22,7 +22,6 @@ import ( "path/filepath" "strings" - "github.com/NVIDIA/go-nvml/pkg/nvml" "golang.org/x/sys/unix" "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" @@ -34,21 +33,7 @@ import ( // NewDriverDiscoverer creates a discoverer for the libraries and binaries associated with a driver installation. // The supplied NVML Library is used to query the expected driver version. -func NewDriverDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath string, ldconfigPath string, nvmllib nvml.Interface) (discover.Discover, error) { - if r := nvmllib.Init(); r != nvml.SUCCESS { - return nil, fmt.Errorf("failed to initialize NVML: %v", r) - } - defer func() { - if r := nvmllib.Shutdown(); r != nvml.SUCCESS { - logger.Warningf("failed to shutdown NVML: %v", r) - } - }() - - version, r := nvmllib.SystemGetDriverVersion() - if r != nvml.SUCCESS { - return nil, fmt.Errorf("failed to determine driver version: %v", r) - } - +func NewDriverDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath string, ldconfigPath string, version string) (discover.Discover, error) { return newDriverVersionDiscoverer(logger, driver, nvidiaCDIHookPath, ldconfigPath, version) } diff --git a/pkg/nvcdi/lib-nvml.go b/pkg/nvcdi/lib-nvml.go index c940b090d..3c8c3ee3e 100644 --- a/pkg/nvcdi/lib-nvml.go +++ b/pkg/nvcdi/lib-nvml.go @@ -83,7 +83,25 @@ func (l *nvmllib) GetAllDeviceSpecs() ([]specs.Device, error) { // GetCommonEdits generates a CDI specification that can be used for ANY devices func (l *nvmllib) GetCommonEdits() (*cdi.ContainerEdits, error) { - common, err := l.newCommonNVMLDiscoverer() + if l.nvsandboxutilslib != nil { + if r := l.nvsandboxutilslib.Init(l.driverRoot); r != nvsandboxutils.SUCCESS { + l.logger.Warningf("Failed to init nvsandboxutils: %v; ignoring", r) + l.nvsandboxutilslib = nil + } + defer func() { + if l.nvsandboxutilslib == nil { + return + } + _ = l.nvsandboxutilslib.Shutdown() + }() + } + + version, err := (*nvcdilib)(l).getDriverVersion() + if err != nil { + return nil, fmt.Errorf("failed to get driver version: %v", err) + } + + common, err := l.newCommonNVMLDiscoverer(version) if err != nil { return nil, fmt.Errorf("failed to create discoverer for common entities: %v", err) } diff --git a/pkg/nvcdi/lib.go b/pkg/nvcdi/lib.go index efabded4b..4c4741d4c 100644 --- a/pkg/nvcdi/lib.go +++ b/pkg/nvcdi/lib.go @@ -18,12 +18,15 @@ package nvcdi import ( "fmt" + "path/filepath" + "strings" "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" "github.com/NVIDIA/go-nvml/pkg/nvml" "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" + "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/cuda" "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root" "github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils" "github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/tegra/csv" @@ -184,18 +187,36 @@ func (l *nvcdilib) resolveMode() (rmode string) { return ModeNvml } -// getCudaVersion returns the CUDA version of the current system. -func (l *nvcdilib) getCudaVersion() (string, error) { - version, err := l.getCudaVersionNvsandboxutils() - if err == nil { +// getDriverVersion returns the driver version of the current system. +func (l *nvcdilib) getDriverVersion() (string, error) { + if version, err := l.getDriverVersionNvsandboxutils(); err == nil && version != "" { return version, err } // Fallback to NVML - return l.getCudaVersionNvml() + if version, err := l.getDriverVersionNvml(); err == nil && version != "" { + return version, err + } + + // Fallback to getting the version from the libcuda.so suffix. + return l.getDriverVersionLibcudaSo() +} + +func (l *nvcdilib) getDriverVersionLibcudaSo() (string, error) { + libCudaPaths, err := cuda.New( + l.driver.Libraries(), + ).Locate(".*.*") + if err != nil { + return "", fmt.Errorf("failed to locate libcuda.so: %v", err) + } + libCudaPath := libCudaPaths[0] + + version := strings.TrimPrefix(filepath.Base(libCudaPath), "libcuda.so.") + + return version, nil } -func (l *nvcdilib) getCudaVersionNvml() (string, error) { +func (l *nvcdilib) getDriverVersionNvml() (string, error) { if hasNVML, reason := l.infolib.HasNvml(); !hasNVML { return "", fmt.Errorf("nvml not detected: %v", reason) } @@ -219,7 +240,7 @@ func (l *nvcdilib) getCudaVersionNvml() (string, error) { return version, nil } -func (l *nvcdilib) getCudaVersionNvsandboxutils() (string, error) { +func (l *nvcdilib) getDriverVersionNvsandboxutils() (string, error) { if l.nvsandboxutilslib == nil { return "", fmt.Errorf("libnvsandboxutils is not available") } diff --git a/pkg/nvcdi/management.go b/pkg/nvcdi/management.go index dee63a147..b11f43103 100644 --- a/pkg/nvcdi/management.go +++ b/pkg/nvcdi/management.go @@ -27,7 +27,6 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/NVIDIA/nvidia-container-toolkit/internal/edits" - "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/cuda" "github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils" "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec" ) @@ -75,7 +74,7 @@ func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) { }() } - version, err := m.getCudaVersion() + version, err := (*nvcdilib)(m).getDriverVersion() if err != nil { return nil, fmt.Errorf("failed to get CUDA version: %v", err) } @@ -93,27 +92,6 @@ func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) { return edits, nil } -// getCudaVersion returns the CUDA version for use in managementlib containers. -func (m *managementlib) getCudaVersion() (string, error) { - version, err := (*nvcdilib)(m).getCudaVersion() - if err == nil { - return version, nil - } - - libCudaPaths, err := cuda.New( - m.driver.Libraries(), - ).Locate(".*.*") - if err != nil { - return "", fmt.Errorf("failed to locate libcuda.so: %v", err) - } - - libCudaPath := libCudaPaths[0] - - version = strings.TrimPrefix(filepath.Base(libCudaPath), "libcuda.so.") - - return version, nil -} - type managementDiscoverer struct { discover.Discover } From 841c7d18371c082a8e942c83df90ec2289ca0d4f Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 5 Dec 2024 18:06:04 +0100 Subject: [PATCH 4/4] [no-relnote] Refactor driver library discovery This change aligns the driver file discovery with device discovery and allows other sources such as nvsandboxutils to be added. Signed-off-by: Evan Lezar --- .../gen/nvsandboxutils/nvsandboxutils.yml | 1 + .../platform-support/dgpu}/driver-nvml.go | 76 ++++++++----------- .../dgpu/driver-nvsandboxutils.go | 31 ++++++++ internal/platform-support/dgpu/driver.go | 74 ++++++++++++++++++ internal/platform-support/dgpu/options.go | 25 ++++++ pkg/nvcdi/common-nvml.go | 11 ++- pkg/nvcdi/management.go | 13 +++- 7 files changed, 182 insertions(+), 49 deletions(-) rename {pkg/nvcdi => internal/platform-support/dgpu}/driver-nvml.go (66%) create mode 100644 internal/platform-support/dgpu/driver-nvsandboxutils.go create mode 100644 internal/platform-support/dgpu/driver.go diff --git a/internal/nvsandboxutils/gen/nvsandboxutils/nvsandboxutils.yml b/internal/nvsandboxutils/gen/nvsandboxutils/nvsandboxutils.yml index 851db7436..760616dc9 100644 --- a/internal/nvsandboxutils/gen/nvsandboxutils/nvsandboxutils.yml +++ b/internal/nvsandboxutils/gen/nvsandboxutils/nvsandboxutils.yml @@ -49,6 +49,7 @@ TRANSLATOR: const: - {action: accept, from: "^NVSANDBOXUTILS_"} - {action: accept, from: "^nvSandboxUtils"} + - {action: replace, from: "^NVSANDBOXUTILS_255_MASK_", to: "MASK255_" } - {action: replace, from: "^NVSANDBOXUTILS_"} - {action: replace, from: "^nvSandboxUtils"} - {action: accept, from: "^NV"} diff --git a/pkg/nvcdi/driver-nvml.go b/internal/platform-support/dgpu/driver-nvml.go similarity index 66% rename from pkg/nvcdi/driver-nvml.go rename to internal/platform-support/dgpu/driver-nvml.go index 519c6c36f..2761e4457 100644 --- a/pkg/nvcdi/driver-nvml.go +++ b/internal/platform-support/dgpu/driver-nvml.go @@ -14,7 +14,7 @@ # limitations under the License. **/ -package nvcdi +package dgpu import ( "fmt" @@ -31,33 +31,22 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root" ) -// NewDriverDiscoverer creates a discoverer for the libraries and binaries associated with a driver installation. -// The supplied NVML Library is used to query the expected driver version. -func NewDriverDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath string, ldconfigPath string, version string) (discover.Discover, error) { - return newDriverVersionDiscoverer(logger, driver, nvidiaCDIHookPath, ldconfigPath, version) -} - -func newDriverVersionDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath, ldconfigPath, version string) (discover.Discover, error) { - libraries, err := NewDriverLibraryDiscoverer(logger, driver, nvidiaCDIHookPath, ldconfigPath, version) +// newNvmlDriverDiscoverer constructs a discoverer from the specified NVML library. +func (o *options) newNvmlDriverDiscoverer() (discover.Discover, error) { + libraries, err := o.newNvmlDriverLibraryDiscoverer() if err != nil { return nil, fmt.Errorf("failed to create discoverer for driver libraries: %v", err) } - ipcs, err := discover.NewIPCDiscoverer(logger, driver.Root) - if err != nil { - return nil, fmt.Errorf("failed to create discoverer for IPC sockets: %v", err) - } - - firmwares, err := NewDriverFirmwareDiscoverer(logger, driver.Root, version) + firmwares, err := o.newNvmlDriverFirmwareDiscoverer() if err != nil { return nil, fmt.Errorf("failed to create discoverer for GSP firmware: %v", err) } - binaries := NewDriverBinariesDiscoverer(logger, driver.Root) + binaries := o.newNvmlDriverBinariesDiscoverer() d := discover.Merge( libraries, - ipcs, firmwares, binaries, ) @@ -65,32 +54,27 @@ func newDriverVersionDiscoverer(logger logger.Interface, driver *root.Driver, nv return d, nil } -// NewDriverLibraryDiscoverer creates a discoverer for the libraries associated with the specified driver version. -func NewDriverLibraryDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath, ldconfigPath, version string) (discover.Discover, error) { - libraryPaths, err := getVersionLibs(logger, driver, version) +// newNvmlDriverLibraryDiscoverer creates a discoverer for the libraries associated with the specified driver version. +func (o *options) newNvmlDriverLibraryDiscoverer() (discover.Discover, error) { + libraryPaths, err := getVersionLibs(o.logger, o.driver, o.version) if err != nil { return nil, fmt.Errorf("failed to get libraries for driver version: %v", err) } libraries := discover.NewMounts( - logger, + o.logger, lookup.NewFileLocator( - lookup.WithLogger(logger), - lookup.WithRoot(driver.Root), + lookup.WithLogger(o.logger), + lookup.WithRoot(o.driver.Root), ), - driver.Root, + o.driver.Root, libraryPaths, ) - updateLDCache, _ := discover.NewLDCacheUpdateHook(logger, libraries, nvidiaCDIHookPath, ldconfigPath) - - d := discover.Merge( - discover.WithDriverDotSoSymlinks( - libraries, - version, - nvidiaCDIHookPath, - ), - updateLDCache, + d := discover.WithDriverDotSoSymlinks( + libraries, + o.version, + o.nvidiaCDIHookPath, ) return d, nil @@ -138,31 +122,31 @@ func getCustomFirmwareClassPath(logger logger.Interface) string { return strings.TrimSpace(string(customFirmwareClassPath)) } -// NewDriverFirmwareDiscoverer creates a discoverer for GSP firmware associated with the specified driver version. -func NewDriverFirmwareDiscoverer(logger logger.Interface, driverRoot string, version string) (discover.Discover, error) { - gspFirmwareSearchPaths, err := getFirmwareSearchPaths(logger) +// newNvmlDriverFirmwareDiscoverer creates a discoverer for GSP firmware associated with the specified driver version. +func (o *options) newNvmlDriverFirmwareDiscoverer() (discover.Discover, error) { + gspFirmwareSearchPaths, err := getFirmwareSearchPaths(o.logger) if err != nil { return nil, fmt.Errorf("failed to get firmware search paths: %v", err) } - gspFirmwarePaths := filepath.Join("nvidia", version, "gsp*.bin") + gspFirmwarePaths := filepath.Join("nvidia", o.version, "gsp*.bin") return discover.NewMounts( - logger, + o.logger, lookup.NewFileLocator( - lookup.WithLogger(logger), - lookup.WithRoot(driverRoot), + lookup.WithLogger(o.logger), + lookup.WithRoot(o.driver.Root), lookup.WithSearchPaths(gspFirmwareSearchPaths...), ), - driverRoot, + o.driver.Root, []string{gspFirmwarePaths}, ), nil } -// NewDriverBinariesDiscoverer creates a discoverer for GSP firmware associated with the GPU driver. -func NewDriverBinariesDiscoverer(logger logger.Interface, driverRoot string) discover.Discover { +// newNvmlDriverBinariesDiscoverer creates a discoverer for binaries associated with the specified driver version. +func (o *options) newNvmlDriverBinariesDiscoverer() discover.Discover { return discover.NewMounts( - logger, - lookup.NewExecutableLocator(logger, driverRoot), - driverRoot, + o.logger, + lookup.NewExecutableLocator(o.logger, o.driver.Root), + o.driver.Root, []string{ "nvidia-smi", /* System management interface */ "nvidia-debugdump", /* GPU coredump utility */ diff --git a/internal/platform-support/dgpu/driver-nvsandboxutils.go b/internal/platform-support/dgpu/driver-nvsandboxutils.go new file mode 100644 index 000000000..713260833 --- /dev/null +++ b/internal/platform-support/dgpu/driver-nvsandboxutils.go @@ -0,0 +1,31 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package dgpu + +import ( + "fmt" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" +) + +// newNvsandboxutilsDriverDiscoverer constructs a discoverer from the specified nvsandboxutils library. +func (o *options) newNvsandboxutilsDriverDiscoverer() (discover.Discover, error) { + if o.nvsandboxutilslib == nil { + return nil, nil + } + return nil, fmt.Errorf("nvsandboxutils driver discovery is not implemented") +} diff --git a/internal/platform-support/dgpu/driver.go b/internal/platform-support/dgpu/driver.go new file mode 100644 index 000000000..2533edfb6 --- /dev/null +++ b/internal/platform-support/dgpu/driver.go @@ -0,0 +1,74 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package dgpu + +import ( + "errors" + "fmt" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" +) + +// NewDriverDiscoverer creates a discoverer for the libraries and binaries associated with a driver installation. +func NewDriverDiscoverer(opts ...Option) (discover.Discover, error) { + o := new(opts...) + + if o.version == "" { + return nil, fmt.Errorf("a version must be specified") + } + + var discoverers []discover.Discover + var errs error + + nvsandboxutilsDiscoverer, err := o.newNvsandboxutilsDriverDiscoverer() + if err != nil { + // TODO: Log a warning + errs = errors.Join(errs, err) + } else if nvsandboxutilsDiscoverer != nil { + discoverers = append(discoverers, nvsandboxutilsDiscoverer) + } + + nvmlDiscoverer, err := o.newNvmlDriverDiscoverer() + if err != nil { + // TODO: Log a warning + errs = errors.Join(errs, err) + } else if nvmlDiscoverer != nil { + discoverers = append(discoverers, nvmlDiscoverer) + } + + if len(discoverers) == 0 { + return nil, errs + } + + cached := discover.WithCache( + discover.FirstValid( + discoverers..., + ), + ) + updateLDCache, _ := discover.NewLDCacheUpdateHook(o.logger, cached, o.nvidiaCDIHookPath, o.ldconfigPath) + + ipcs, err := discover.NewIPCDiscoverer(o.logger, o.driver.Root) + if err != nil { + return nil, fmt.Errorf("failed to create discoverer for IPC sockets: %v", err) + } + + return discover.Merge( + cached, + updateLDCache, + ipcs, + ), nil +} diff --git a/internal/platform-support/dgpu/options.go b/internal/platform-support/dgpu/options.go index 2fd1c01bd..e8e3bac36 100644 --- a/internal/platform-support/dgpu/options.go +++ b/internal/platform-support/dgpu/options.go @@ -18,13 +18,16 @@ package dgpu import ( "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" + "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root" "github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps" "github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils" ) type options struct { logger logger.Interface + driver *root.Driver devRoot string + ldconfigPath string nvidiaCDIHookPath string isMigDevice bool @@ -33,6 +36,9 @@ type options struct { migCaps nvcaps.MigCaps migCapsError error + // version stores the driver version. + version string + nvsandboxutilslib nvsandboxutils.Interface } @@ -45,6 +51,19 @@ func WithDevRoot(root string) Option { } } +func WithDriver(driver *root.Driver) Option { + return func(l *options) { + l.driver = driver + } +} + +// WithLdconfigPath sets the path to the ldconfig program +func WithLdconfigPath(path string) Option { + return func(l *options) { + l.ldconfigPath = path + } +} + // WithLogger sets the logger for the library func WithLogger(logger logger.Interface) Option { return func(l *options) { @@ -72,3 +91,9 @@ func WithNvsandboxuitilsLib(nvsandboxutilslib nvsandboxutils.Interface) Option { l.nvsandboxutilslib = nvsandboxutilslib } } + +func WithVersion(version string) Option { + return func(l *options) { + l.version = version + } +} diff --git a/pkg/nvcdi/common-nvml.go b/pkg/nvcdi/common-nvml.go index 1acb309aa..17ac87769 100644 --- a/pkg/nvcdi/common-nvml.go +++ b/pkg/nvcdi/common-nvml.go @@ -20,6 +20,7 @@ import ( "fmt" "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu" ) // newCommonNVMLDiscoverer returns a discoverer for entities that are not associated with a specific CDI device. @@ -41,7 +42,15 @@ func (l *nvmllib) newCommonNVMLDiscoverer(version string) (discover.Discover, er l.logger.Warningf("failed to create discoverer for graphics mounts: %v", err) } - driverFiles, err := NewDriverDiscoverer(l.logger, l.driver, l.nvidiaCDIHookPath, l.ldconfigPath, version) + driverFiles, err := dgpu.NewDriverDiscoverer( + dgpu.WithDevRoot(l.devRoot), + dgpu.WithDriver(l.driver), + dgpu.WithLdconfigPath(l.ldconfigPath), + dgpu.WithLogger(l.logger), + dgpu.WithNVIDIACDIHookPath(l.nvidiaCDIHookPath), + dgpu.WithNvsandboxuitilsLib(l.nvsandboxutilslib), + dgpu.WithVersion(version), + ) if err != nil { return nil, fmt.Errorf("failed to create discoverer for driver files: %v", err) } diff --git a/pkg/nvcdi/management.go b/pkg/nvcdi/management.go index b11f43103..38fbae475 100644 --- a/pkg/nvcdi/management.go +++ b/pkg/nvcdi/management.go @@ -28,6 +28,7 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/NVIDIA/nvidia-container-toolkit/internal/edits" "github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils" + "github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu" "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec" ) @@ -76,10 +77,18 @@ func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) { version, err := (*nvcdilib)(m).getDriverVersion() if err != nil { - return nil, fmt.Errorf("failed to get CUDA version: %v", err) + return nil, fmt.Errorf("failed to get driver version: %v", err) } - driver, err := newDriverVersionDiscoverer(m.logger, m.driver, m.nvidiaCDIHookPath, m.ldconfigPath, version) + driver, err := dgpu.NewDriverDiscoverer( + dgpu.WithDevRoot(m.devRoot), + dgpu.WithDriver(m.driver), + dgpu.WithLdconfigPath(m.ldconfigPath), + dgpu.WithLogger(m.logger), + dgpu.WithNVIDIACDIHookPath(m.nvidiaCDIHookPath), + dgpu.WithNvsandboxuitilsLib(m.nvsandboxutilslib), + dgpu.WithVersion(version), + ) if err != nil { return nil, fmt.Errorf("failed to create driver library discoverer: %v", err) }