-
Notifications
You must be signed in to change notification settings - Fork 296
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #552 from elezar/refactor-dgpu-discovery
Refactor dGPU device discovery
- Loading branch information
Showing
7 changed files
with
499 additions
and
219 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
/** | ||
# Copyright 2024 NVIDIA CORPORATION | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
**/ | ||
|
||
package dgpu | ||
|
||
import ( | ||
"fmt" | ||
"os" | ||
"path/filepath" | ||
|
||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover" | ||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger" | ||
) | ||
|
||
// byPathHookDiscoverer discovers the entities required for injecting by-path DRM device links | ||
type byPathHookDiscoverer struct { | ||
logger logger.Interface | ||
devRoot string | ||
nvidiaCDIHookPath string | ||
pciBusID string | ||
deviceNodes discover.Discover | ||
} | ||
|
||
var _ discover.Discover = (*byPathHookDiscoverer)(nil) | ||
|
||
// Devices returns the empty list for the by-path hook discoverer | ||
func (d *byPathHookDiscoverer) Devices() ([]discover.Device, error) { | ||
return nil, nil | ||
} | ||
|
||
// Hooks returns the hooks for the GPU device. | ||
// The following hooks are detected: | ||
// 1. A hook to create /dev/dri/by-path symlinks | ||
func (d *byPathHookDiscoverer) Hooks() ([]discover.Hook, error) { | ||
links, err := d.deviceNodeLinks() | ||
if err != nil { | ||
return nil, fmt.Errorf("failed to discover DRA device links: %v", err) | ||
} | ||
if len(links) == 0 { | ||
return nil, nil | ||
} | ||
|
||
var args []string | ||
for _, l := range links { | ||
args = append(args, "--link", l) | ||
} | ||
|
||
hook := discover.CreateNvidiaCDIHook( | ||
d.nvidiaCDIHookPath, | ||
"create-symlinks", | ||
args..., | ||
) | ||
|
||
return []discover.Hook{hook}, nil | ||
} | ||
|
||
// Mounts returns an empty slice for a full GPU | ||
func (d *byPathHookDiscoverer) Mounts() ([]discover.Mount, error) { | ||
return nil, nil | ||
} | ||
|
||
func (d *byPathHookDiscoverer) deviceNodeLinks() ([]string, error) { | ||
devices, err := d.deviceNodes.Devices() | ||
if err != nil { | ||
return nil, fmt.Errorf("failed to discover device nodes: %v", err) | ||
} | ||
|
||
if len(devices) == 0 { | ||
return nil, nil | ||
} | ||
|
||
selectedDevices := make(map[string]bool) | ||
for _, d := range devices { | ||
selectedDevices[d.HostPath] = true | ||
} | ||
|
||
candidates := []string{ | ||
fmt.Sprintf("/dev/dri/by-path/pci-%s-card", d.pciBusID), | ||
fmt.Sprintf("/dev/dri/by-path/pci-%s-render", d.pciBusID), | ||
} | ||
|
||
var links []string | ||
for _, c := range candidates { | ||
linkPath := filepath.Join(d.devRoot, c) | ||
device, err := os.Readlink(linkPath) | ||
if err != nil { | ||
d.logger.Warningf("Failed to evaluate symlink %v; ignoring", linkPath) | ||
continue | ||
} | ||
|
||
deviceNode := device | ||
if !filepath.IsAbs(device) { | ||
deviceNode = filepath.Join(filepath.Dir(linkPath), device) | ||
} | ||
if !selectedDevices[deviceNode] { | ||
d.logger.Debugf("ignoring device symlink %v -> %v since %v is not mounted", linkPath, device, deviceNode) | ||
continue | ||
} | ||
d.logger.Debugf("adding device symlink %v -> %v", linkPath, device) | ||
links = append(links, fmt.Sprintf("%v::%v", device, linkPath)) | ||
} | ||
|
||
return links, nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
/** | ||
# Copyright 2024 NVIDIA CORPORATION | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
**/ | ||
|
||
package dgpu | ||
|
||
import ( | ||
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device" | ||
|
||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover" | ||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger" | ||
) | ||
|
||
// NewForDevice creates a discoverer for the specified Device. | ||
func NewForDevice(d device.Device, opts ...Option) (discover.Discover, error) { | ||
o := &options{} | ||
for _, opt := range opts { | ||
opt(o) | ||
} | ||
|
||
if o.logger == nil { | ||
o.logger = logger.New() | ||
} | ||
|
||
return o.newNvmlDGPUDiscoverer(&toRequiredInfo{d}) | ||
} | ||
|
||
// NewForDevice creates a discoverer for the specified device and its associated MIG device. | ||
func NewForMigDevice(d device.Device, mig device.MigDevice, opts ...Option) (discover.Discover, error) { | ||
o := &options{} | ||
for _, opt := range opts { | ||
opt(o) | ||
} | ||
|
||
if o.logger == nil { | ||
o.logger = logger.New() | ||
} | ||
|
||
return o.newNvmlMigDiscoverer( | ||
&toRequiredMigInfo{ | ||
MigDevice: mig, | ||
parent: &toRequiredInfo{d}, | ||
}, | ||
) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,168 @@ | ||
/** | ||
# Copyright 2024 NVIDIA CORPORATION | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
**/ | ||
|
||
package dgpu | ||
|
||
import ( | ||
"fmt" | ||
|
||
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device" | ||
"github.com/NVIDIA/go-nvml/pkg/nvml" | ||
|
||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover" | ||
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm" | ||
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps" | ||
) | ||
|
||
// requiredInfo is the subset of device information that the discoverers in
// this package consume. Errors are reported as plain Go errors.
type requiredInfo interface {
	// GetMinorNumber returns the minor number of the device.
	GetMinorNumber() (int, error)
	// GetPCIBusID returns the PCI bus ID of the device.
	GetPCIBusID() (string, error)
	// getDevNodePath returns the path of the device node for the device.
	getDevNodePath() (string, error)
}
|
||
func (o *options) newNvmlDGPUDiscoverer(d requiredInfo) (discover.Discover, error) { | ||
path, err := d.getDevNodePath() | ||
if err != nil { | ||
return nil, fmt.Errorf("error getting device node path: %w", err) | ||
} | ||
|
||
pciBusID, err := d.GetPCIBusID() | ||
if err != nil { | ||
return nil, fmt.Errorf("error getting PCI info for device: %w", err) | ||
} | ||
|
||
drmDeviceNodes, err := drm.GetDeviceNodesByBusID(pciBusID) | ||
if err != nil { | ||
return nil, fmt.Errorf("failed to determine DRM devices for %v: %v", pciBusID, err) | ||
} | ||
|
||
deviceNodePaths := append([]string{path}, drmDeviceNodes...) | ||
|
||
deviceNodes := discover.NewCharDeviceDiscoverer( | ||
o.logger, | ||
o.devRoot, | ||
deviceNodePaths, | ||
) | ||
|
||
byPathHooks := &byPathHookDiscoverer{ | ||
logger: o.logger, | ||
devRoot: o.devRoot, | ||
nvidiaCDIHookPath: o.nvidiaCDIHookPath, | ||
pciBusID: pciBusID, | ||
deviceNodes: deviceNodes, | ||
} | ||
|
||
dd := discover.Merge( | ||
deviceNodes, | ||
byPathHooks, | ||
) | ||
return dd, nil | ||
} | ||
|
||
// requiredMigInfo is the subset of MIG device information that
// newNvmlMigDiscoverer consumes.
type requiredMigInfo interface {
	// getPlacementInfo returns three integers, used by newNvmlMigDiscoverer
	// as the GPU, GPU instance, and compute instance identifiers, in that
	// order.
	getPlacementInfo() (int, int, int, error)
	// getDevNodePath returns the device node path; newNvmlMigDiscoverer
	// treats it as the path of the parent device.
	getDevNodePath() (string, error)
}
|
||
func (o *options) newNvmlMigDiscoverer(d requiredMigInfo) (discover.Discover, error) { | ||
gpu, gi, ci, err := d.getPlacementInfo() | ||
if err != nil { | ||
return nil, fmt.Errorf("error getting placement info: %w", err) | ||
} | ||
|
||
migCaps, err := nvcaps.NewMigCaps() | ||
if err != nil { | ||
return nil, fmt.Errorf("error getting MIG capability device paths: %v", err) | ||
} | ||
|
||
giCap := nvcaps.NewGPUInstanceCap(gpu, gi) | ||
giCapDevicePath, err := migCaps.GetCapDevicePath(giCap) | ||
if err != nil { | ||
return nil, fmt.Errorf("failed to get GI cap device path: %v", err) | ||
} | ||
|
||
ciCap := nvcaps.NewComputeInstanceCap(gpu, gi, ci) | ||
ciCapDevicePath, err := migCaps.GetCapDevicePath(ciCap) | ||
if err != nil { | ||
return nil, fmt.Errorf("failed to get CI cap device path: %v", err) | ||
} | ||
|
||
parentPath, err := d.getDevNodePath() | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
deviceNodes := discover.NewCharDeviceDiscoverer( | ||
o.logger, | ||
o.devRoot, | ||
[]string{ | ||
parentPath, | ||
giCapDevicePath, | ||
ciCapDevicePath, | ||
}, | ||
) | ||
|
||
return deviceNodes, nil | ||
} | ||
|
||
type toRequiredInfo struct { | ||
device.Device | ||
} | ||
|
||
func (d *toRequiredInfo) GetMinorNumber() (int, error) { | ||
minor, ret := d.Device.GetMinorNumber() | ||
if ret != nvml.SUCCESS { | ||
return 0, ret | ||
} | ||
return minor, nil | ||
} | ||
|
||
func (d *toRequiredInfo) getDevNodePath() (string, error) { | ||
minor, err := d.GetMinorNumber() | ||
if err != nil { | ||
return "", fmt.Errorf("error getting GPU device minor number: %w", err) | ||
} | ||
path := fmt.Sprintf("/dev/nvidia%d", minor) | ||
return path, nil | ||
} | ||
|
||
type toRequiredMigInfo struct { | ||
device.MigDevice | ||
parent requiredInfo | ||
} | ||
|
||
func (d *toRequiredMigInfo) getPlacementInfo() (int, int, int, error) { | ||
gpu, ret := d.parent.GetMinorNumber() | ||
if ret != nvml.SUCCESS { | ||
return 0, 0, 0, fmt.Errorf("error getting GPU minor: %v", ret) | ||
} | ||
|
||
gi, ret := d.GetGpuInstanceId() | ||
if ret != nvml.SUCCESS { | ||
return 0, 0, 0, fmt.Errorf("error getting GPU Instance ID: %v", ret) | ||
} | ||
|
||
ci, ret := d.GetComputeInstanceId() | ||
if ret != nvml.SUCCESS { | ||
return 0, 0, 0, fmt.Errorf("error getting Compute Instance ID: %v", ret) | ||
} | ||
|
||
return gpu, gi, ci, nil | ||
} | ||
|
||
func (d *toRequiredMigInfo) getDevNodePath() (string, error) { | ||
return d.parent.getDevNodePath() | ||
} |
Oops, something went wrong.