Skip to content

Commit

Permalink
Merge pull request #552 from elezar/refactor-dgpu-discovery
Browse files Browse the repository at this point in the history
Refactor dGPU device discovery
  • Loading branch information
elezar authored Jul 10, 2024
2 parents 9dd4e35 + be11cf4 commit 448a385
Show file tree
Hide file tree
Showing 7 changed files with 499 additions and 219 deletions.
117 changes: 117 additions & 0 deletions internal/platform-support/dgpu/by-path-hooks.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
/**
# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

package dgpu

import (
"fmt"
"os"
"path/filepath"

"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
)

// byPathHookDiscoverer discovers the entities required for injecting by-path DRM device links
// into a container. It only contributes hooks; its Devices and Mounts methods return nothing.
type byPathHookDiscoverer struct {
// logger receives the warning and debug messages emitted while evaluating symlinks.
logger logger.Interface
// devRoot is joined in front of the /dev/dri/by-path candidate paths before they are read.
devRoot string
// nvidiaCDIHookPath is passed to discover.CreateNvidiaCDIHook when the hook is constructed.
nvidiaCDIHookPath string
// pciBusID is interpolated into the pci-<id>-card and pci-<id>-render link names.
pciBusID string
// deviceNodes supplies the devices whose host paths decide which symlinks are kept.
deviceNodes discover.Discover
}

// Compile-time check that *byPathHookDiscoverer implements discover.Discover.
var _ discover.Discover = (*byPathHookDiscoverer)(nil)

// Devices reports no devices for the by-path hook discoverer; the device
// nodes themselves are expected to come from a separate discoverer.
func (d *byPathHookDiscoverer) Devices() ([]discover.Device, error) {
	var none []discover.Device
	return none, nil
}

// Hooks returns the hooks for the GPU device.
// At most one hook is returned: a "create-symlinks" hook that recreates the
// /dev/dri/by-path symlinks reported by deviceNodeLinks. If no links are
// found, no hook is returned.
//
// An error is returned only if the links could not be discovered; the
// underlying error is wrapped so that callers can inspect it with errors.Is
// or errors.As.
func (d *byPathHookDiscoverer) Hooks() ([]discover.Hook, error) {
	links, err := d.deviceNodeLinks()
	if err != nil {
		return nil, fmt.Errorf("failed to discover DRM device links: %w", err)
	}
	if len(links) == 0 {
		return nil, nil
	}

	// Every link is passed to the hook as its own "--link <value>" pair.
	args := make([]string, 0, 2*len(links))
	for _, l := range links {
		args = append(args, "--link", l)
	}

	hook := discover.CreateNvidiaCDIHook(
		d.nvidiaCDIHookPath,
		"create-symlinks",
		args...,
	)

	return []discover.Hook{hook}, nil
}

// Mounts reports no mounts for the by-path hook discoverer.
func (d *byPathHookDiscoverer) Mounts() ([]discover.Mount, error) {
	var none []discover.Mount
	return none, nil
}

// deviceNodeLinks returns the by-path symlinks that point at one of the
// discovered device nodes. Each entry has the form "<target>::<link path>",
// where <target> is the unmodified value read from the symlink.
//
// The candidates checked are the pci-<pciBusID>-card and pci-<pciBusID>-render
// entries under <devRoot>/dev/dri/by-path. A candidate that cannot be read as
// a symlink is logged and skipped rather than treated as a failure. A
// candidate whose resolved target is not the host path of a discovered device
// is skipped as well.
//
// An error is returned only if the device nodes cannot be discovered.
func (d *byPathHookDiscoverer) deviceNodeLinks() ([]string, error) {
	devices, err := d.deviceNodes.Devices()
	if err != nil {
		return nil, fmt.Errorf("failed to discover device nodes: %w", err)
	}

	if len(devices) == 0 {
		return nil, nil
	}

	// Index the discovered devices by host path for constant-time lookups.
	selectedDevices := make(map[string]bool, len(devices))
	for _, device := range devices {
		selectedDevices[device.HostPath] = true
	}

	candidates := []string{
		fmt.Sprintf("/dev/dri/by-path/pci-%s-card", d.pciBusID),
		fmt.Sprintf("/dev/dri/by-path/pci-%s-render", d.pciBusID),
	}

	var links []string
	for _, c := range candidates {
		linkPath := filepath.Join(d.devRoot, c)
		target, err := os.Readlink(linkPath)
		if err != nil {
			// Missing or unreadable links are expected on some systems;
			// include the cause so that the skip can be diagnosed.
			d.logger.Warningf("Failed to evaluate symlink %v; ignoring: %v", linkPath, err)
			continue
		}

		// Relative link targets are resolved against the directory that
		// contains the link before comparing with the device host paths.
		deviceNode := target
		if !filepath.IsAbs(target) {
			deviceNode = filepath.Join(filepath.Dir(linkPath), target)
		}
		if !selectedDevices[deviceNode] {
			d.logger.Debugf("ignoring device symlink %v -> %v since %v is not mounted", linkPath, target, deviceNode)
			continue
		}
		d.logger.Debugf("adding device symlink %v -> %v", linkPath, target)
		links = append(links, fmt.Sprintf("%v::%v", target, linkPath))
	}

	return links, nil
}
57 changes: 57 additions & 0 deletions internal/platform-support/dgpu/dgpu.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/**
# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

package dgpu

import (
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"

"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
)

// NewForDevice creates a discoverer for the specified Device.
// The supplied options are applied in order; if none of them sets a logger,
// one is created with logger.New.
func NewForDevice(d device.Device, opts ...Option) (discover.Discover, error) {
	o := new(options)
	for i := range opts {
		opts[i](o)
	}
	if o.logger == nil {
		o.logger = logger.New()
	}

	info := &toRequiredInfo{d}
	return o.newNvmlDGPUDiscoverer(info)
}

// NewForMigDevice creates a discoverer for the specified device and its associated MIG device.
// The supplied options are applied in order; if none of them sets a logger,
// one is created with logger.New.
func NewForMigDevice(d device.Device, mig device.MigDevice, opts ...Option) (discover.Discover, error) {
	o := &options{}
	for _, opt := range opts {
		opt(o)
	}

	if o.logger == nil {
		o.logger = logger.New()
	}

	// The MIG device is paired with its parent so that the parent's device
	// node and minor number are available during discovery.
	return o.newNvmlMigDiscoverer(
		&toRequiredMigInfo{
			MigDevice: mig,
			parent:    &toRequiredInfo{d},
		},
	)
}
168 changes: 168 additions & 0 deletions internal/platform-support/dgpu/nvml.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
/**
# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

package dgpu

import (
"fmt"

"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
"github.com/NVIDIA/go-nvml/pkg/nvml"

"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm"
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
)

// requiredInfo is the subset of device information needed to construct the
// discoverers in this file. Keeping it as an interface presumably allows it to
// be replaced in tests -- TODO confirm.
type requiredInfo interface {
// GetMinorNumber returns the minor number of the device.
GetMinorNumber() (int, error)
// GetPCIBusID returns the PCI bus ID that is used to look up the DRM device nodes.
GetPCIBusID() (string, error)
// getDevNodePath returns the path of the device node for the device.
getDevNodePath() (string, error)
}

// newNvmlDGPUDiscoverer creates a discoverer for the device described by d.
//
// The result merges two discoverers:
//   - a character device discoverer for the device node of d together with
//     the DRM device nodes found for its PCI bus ID, and
//   - a byPathHookDiscoverer that creates the matching /dev/dri/by-path
//     symlinks.
//
// An error is returned if the device node path, the PCI bus ID, or the DRM
// device nodes cannot be determined. The underlying error is always wrapped.
func (o *options) newNvmlDGPUDiscoverer(d requiredInfo) (discover.Discover, error) {
	path, err := d.getDevNodePath()
	if err != nil {
		return nil, fmt.Errorf("error getting device node path: %w", err)
	}

	pciBusID, err := d.GetPCIBusID()
	if err != nil {
		return nil, fmt.Errorf("error getting PCI info for device: %w", err)
	}

	drmDeviceNodes, err := drm.GetDeviceNodesByBusID(pciBusID)
	if err != nil {
		return nil, fmt.Errorf("failed to determine DRM devices for %v: %w", pciBusID, err)
	}

	// The device node of the GPU comes first, followed by its DRM nodes.
	deviceNodePaths := append([]string{path}, drmDeviceNodes...)

	deviceNodes := discover.NewCharDeviceDiscoverer(
		o.logger,
		o.devRoot,
		deviceNodePaths,
	)

	// The hook discoverer reuses deviceNodes to decide which by-path
	// symlinks refer to devices that are actually being injected.
	byPathHooks := &byPathHookDiscoverer{
		logger:            o.logger,
		devRoot:           o.devRoot,
		nvidiaCDIHookPath: o.nvidiaCDIHookPath,
		pciBusID:          pciBusID,
		deviceNodes:       deviceNodes,
	}

	dd := discover.Merge(
		deviceNodes,
		byPathHooks,
	)
	return dd, nil
}

// requiredMigInfo is the subset of MIG device information needed to construct
// a MIG discoverer.
type requiredMigInfo interface {
// getPlacementInfo returns three integers that newNvmlMigDiscoverer reads as
// the GPU, the GPU instance and the compute instance, in that order.
getPlacementInfo() (int, int, int, error)
// getDevNodePath returns a device node path; newNvmlMigDiscoverer treats it
// as the path of the parent device.
getDevNodePath() (string, error)
}

// newNvmlMigDiscoverer creates a discoverer for the MIG device described by d.
//
// The returned character device discoverer covers three device nodes: the
// device node reported by d (the parent device), the capability device for the
// GPU instance, and the capability device for the compute instance.
//
// An error is returned if the placement information, the MIG capabilities, one
// of the capability device paths, or the parent device node path cannot be
// determined. The underlying error is always wrapped.
func (o *options) newNvmlMigDiscoverer(d requiredMigInfo) (discover.Discover, error) {
	gpu, gi, ci, err := d.getPlacementInfo()
	if err != nil {
		return nil, fmt.Errorf("error getting placement info: %w", err)
	}

	migCaps, err := nvcaps.NewMigCaps()
	if err != nil {
		return nil, fmt.Errorf("error getting MIG capability device paths: %w", err)
	}

	giCap := nvcaps.NewGPUInstanceCap(gpu, gi)
	giCapDevicePath, err := migCaps.GetCapDevicePath(giCap)
	if err != nil {
		return nil, fmt.Errorf("failed to get GI cap device path: %w", err)
	}

	ciCap := nvcaps.NewComputeInstanceCap(gpu, gi, ci)
	ciCapDevicePath, err := migCaps.GetCapDevicePath(ciCap)
	if err != nil {
		return nil, fmt.Errorf("failed to get CI cap device path: %w", err)
	}

	parentPath, err := d.getDevNodePath()
	if err != nil {
		return nil, fmt.Errorf("error getting parent device node path: %w", err)
	}

	deviceNodes := discover.NewCharDeviceDiscoverer(
		o.logger,
		o.devRoot,
		[]string{
			parentPath,
			giCapDevicePath,
			ciCapDevicePath,
		},
	)

	return deviceNodes, nil
}

// toRequiredInfo wraps a device.Device and adds the methods defined on it below
// (GetMinorNumber with an error return, and getDevNodePath).
type toRequiredInfo struct {
device.Device
}

// GetMinorNumber returns the minor number reported by the wrapped device.
// If the wrapped call does not report nvml.SUCCESS, its return code is
// returned as the error and the minor number is 0.
func (d *toRequiredInfo) GetMinorNumber() (int, error) {
	switch minor, ret := d.Device.GetMinorNumber(); ret {
	case nvml.SUCCESS:
		return minor, nil
	default:
		return 0, ret
	}
}

// getDevNodePath returns the device node path for the device, formed as
// /dev/nvidia<minor>. An error is returned if the minor number is unavailable.
func (d *toRequiredInfo) getDevNodePath() (string, error) {
	minorNumber, err := d.GetMinorNumber()
	if err != nil {
		return "", fmt.Errorf("error getting GPU device minor number: %w", err)
	}

	return fmt.Sprintf("/dev/nvidia%d", minorNumber), nil
}

// toRequiredMigInfo wraps a device.MigDevice together with information about
// its parent device and adds the getPlacementInfo and getDevNodePath methods
// defined on it below.
type toRequiredMigInfo struct {
device.MigDevice
// parent provides the minor number and device node path of the parent device.
parent requiredInfo
}

// getPlacementInfo returns the minor number of the parent GPU, the GPU
// instance ID and the compute instance ID of the MIG device, in that order.
//
// An error is returned if any of the three values cannot be determined; in
// that case all three integers are 0.
func (d *toRequiredMigInfo) getPlacementInfo() (int, int, int, error) {
	// The parent reports a plain error that is nil on success. It must not be
	// compared against nvml.SUCCESS: a nil error never equals an nvml.Return
	// value, so that comparison would report a failure on every call.
	gpu, err := d.parent.GetMinorNumber()
	if err != nil {
		return 0, 0, 0, fmt.Errorf("error getting GPU minor: %w", err)
	}

	gi, ret := d.GetGpuInstanceId()
	if ret != nvml.SUCCESS {
		return 0, 0, 0, fmt.Errorf("error getting GPU Instance ID: %v", ret)
	}

	ci, ret := d.GetComputeInstanceId()
	if ret != nvml.SUCCESS {
		return 0, 0, 0, fmt.Errorf("error getting Compute Instance ID: %v", ret)
	}

	return gpu, gi, ci, nil
}

// getDevNodePath returns the device node path of the parent device by
// delegating to the parent's getDevNodePath.
func (d *toRequiredMigInfo) getDevNodePath() (string, error) {
	parent := d.parent
	return parent.getDevNodePath()
}
Loading

0 comments on commit 448a385

Please sign in to comment.