integration: add test for deploying Nvidia gpu-operator through Helm chart #907

Closed
@@ -0,0 +1,14 @@
# Lifted 1:1 from:
# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/getting-started.html#cuda-vectoradd
apiVersion: v1
kind: Pod
metadata:
  name: {}
spec:
  restartPolicy: OnFailure
  containers:
    - name: cuda-vectoradd
      image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubuntu20.04"
      resources:
        limits:
          nvidia.com/gpu: 1
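
For context (not part of the diff): the `{}` placeholder in `metadata.name` above is filled in by the test via `str.format()` before the rendered YAML is piped to `k8s kubectl apply -f -`. A minimal sketch of that rendering step, assuming the manifest is available as a local file (the path below is hypothetical; the test resolves the real one through `config.MANIFESTS_DIR`):

from pathlib import Path

# Hypothetical local copy of the manifest above.
manifest = Path("cuda-vectoradd-nvidia-gpu-test-pod.yaml")

# Fill the `{}` placeholder in metadata.name; the rendered YAML can then be
# piped to `k8s kubectl apply -f -`, which is what the test does via instance.exec.
pod_spec = manifest.read_text().format("cuda-vectoradd")
print(pod_spec)
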
174 changes: 174 additions & 0 deletions tests/integration/tests/test_nvidia_gpu_operator.py
@@ -0,0 +1,174 @@
#
# Copyright 2025 Canonical, Ltd.
#

import logging
from typing import List, Mapping

import pytest
from test_util import config, harness, tags, util

LOG = logging.getLogger(__name__)

NVIDIA_GPU_OPERATOR_HELM_CHART_REPO = "https://helm.ngc.nvidia.com/nvidia"

# Mapping between the versions of the Nvidia `gpu-operator` and
# the host versions of Ubuntu they support.
# Because the `nvidia-driver-daemonset` pod included in the `gpu-operator`
# includes kernel drivers, its container image's release lifecycle is
# strictly tied to the version of Ubuntu on the host.
# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/platform-support.html
NVIDIA_GPU_OPERATOR_SUPPORTED_UBUNTU_VERSIONS = {"v24.9.1": ["20.04", "22.04"]}

NVIDIA_KERNEL_MODULE_NAMES = ["nvidia", "nvidia_uvm", "nvidia_modeset"]

# Lifted 1:1 from:
# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/getting-started.html#cuda-vectoradd
NVIDIA_CUDA_VECTOR_ADDITION_TEST_POD_NAME = "cuda-vectoradd"


def _check_nvidia_gpu_present(instance: harness.Instance) -> bool:
    """Checks whether at least one Nvidia GPU is available
    by exec-ing `lspci` on the target instance."""
    proc = instance.exec(["lspci", "-k"], capture_output=True, text=True)

    for line in proc.stdout.split("\n"):
        if "NVIDIA Corporation" in line:
            LOG.info(f"Found NVIDIA GPU in lspci output: {line}")
            return True

    LOG.info(f"Failed to find NVIDIA GPU in lspci output: {proc.stdout}")
    return False


def _check_nvidia_drivers_loaded(instance: harness.Instance) -> Mapping[str, bool]:
    """Returns a mapping indicating which Nvidia kernel modules are
    currently loaded on the given harness instance."""

    proc = instance.exec(["lsmod"], capture_output=True, text=True)
    modules_present = {m: False for m in NVIDIA_KERNEL_MODULE_NAMES}
    for line in proc.stdout.split("\n"):
        for mod in modules_present:
            if line.startswith(mod):
                modules_present[mod] = True

    LOG.info(f"Located the following Nvidia kernel modules: {modules_present}")
    return modules_present


@pytest.mark.node_count(1)
@pytest.mark.tags(tags.WEEKLY)
@pytest.mark.tags(tags.GPU)
@pytest.mark.parametrize(
    "gpu_operator_version", NVIDIA_GPU_OPERATOR_SUPPORTED_UBUNTU_VERSIONS.keys()
)
def test_deploy_nvidia_gpu_operator(
    instances: List[harness.Instance], gpu_operator_version: str
):
    """Tests that the Nvidia `gpu-operator` can be deployed successfully
    using the upstream Helm chart and that a sample application running a
    small CUDA workload gets scheduled and executed to completion.
    """
    instance = instances[0]
    test_namespace = "gpu-operator"

    # Prechecks to ensure the test instance is valid.
    if not _check_nvidia_gpu_present(instance):
        msg = (
            f"No Nvidia GPU present on harness instance '{instance.id}'. "
            "Skipping GPU-operator test."
        )
        LOG.warning(msg)
        pytest.skip(msg)

    # NOTE(aznashwan): considering the Nvidia gpu-operator's main purpose
    # is to set up the drivers on the nodes, and that running the `gpu-operator`
    # with pre-installed drivers can lead to incompatibilities between the
    # version of the drivers and the rest of the toolchain, we skip the test
    # if any of the drivers happen to be pre-loaded on the harness instance:
    modules_loaded = _check_nvidia_drivers_loaded(instance)
    if any(modules_loaded.values()):
        msg = (
            f"Cannot have any pre-loaded Nvidia GPU drivers before running "
            f"the Nvidia 'gpu-operator' test on instance {instance.id}. "
            f"Current Nvidia driver statuses: {modules_loaded}"
        )
        LOG.warning(msg)
        pytest.skip(msg)

    instance_release = util.get_os_version_id_for_instance(instance)
    if (
        instance_release
        not in NVIDIA_GPU_OPERATOR_SUPPORTED_UBUNTU_VERSIONS[gpu_operator_version]
    ):
        msg = (
            f"Unsupported Ubuntu release '{instance_release}' for `gpu-operator` "
            f"version '{gpu_operator_version}'. Skipping gpu-operator test."
        )
        LOG.warning(msg)
        pytest.skip(msg)

    # Add the upstream Nvidia GPU-operator Helm repo:
    instance.exec(
        ["k8s", "helm", "repo", "add", "nvidia", NVIDIA_GPU_OPERATOR_HELM_CHART_REPO]
    )
    instance.exec(["k8s", "helm", "repo", "update"])

    # Install `gpu-operator` chart:
    instance.exec(
        [
            "k8s",
            "helm",
            "install",
            "--generate-name",
            "--wait",
            "-n",
            test_namespace,
            "--create-namespace",
            "nvidia/gpu-operator",
            f"--version={gpu_operator_version}",
        ]
    )

    # Wait for the core daemonsets of the gpu-operator to be ready:
    daemonsets = [
        "nvidia-driver-daemonset",
        "nvidia-device-plugin-daemonset",
        "nvidia-container-toolkit-daemonset",
    ]
    # NOTE(aznashwan): it takes on average a little under 10 minutes for all
    # of the core daemonsets of the Nvidia GPU-operator to do their thing
    # on an AWS `g4dn.xlarge` instance (4 vCPUs/16GiB RAM), so we offer a
    # generous timeout of 15 minutes:
    for daemonset in daemonsets:
        util.wait_for_daemonset(
            instance,
            daemonset,
            namespace=test_namespace,
            retry_times=15,
            retry_delay_s=60,
        )

    # Deploy a sample CUDA app and let it run to completion:
    pod_spec_file = config.MANIFESTS_DIR / "cuda-vectoradd-nvidia-gpu-test-pod.yaml"
    pod_spec = pod_spec_file.read_text().format(
        NVIDIA_CUDA_VECTOR_ADDITION_TEST_POD_NAME
    )
    instance.exec(
        ["k8s", "kubectl", "-n", test_namespace, "apply", "-f", "-"],
        input=pod_spec.encode(),
    )
    util.stubbornly(retries=3, delay_s=1).on(instance).exec(
        [
            "k8s",
            "kubectl",
            "-n",
            test_namespace,
            "wait",
            "--for=condition=ready",
            "pod",
            NVIDIA_CUDA_VECTOR_ADDITION_TEST_POD_NAME,
            "--timeout",
            "180s",
        ]
    )
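
For context (not part of the diff): a hypothetical follow-up step that dumps the sample pod's logs after it has run, useful for debugging and consistent with how the rest of the test shells out through `instance.exec`. It assumes `instance`, `test_namespace` and the module constants above are in scope:

proc = instance.exec(
    [
        "k8s",
        "kubectl",
        "-n",
        test_namespace,
        "logs",
        NVIDIA_CUDA_VECTOR_ADDITION_TEST_POD_NAME,
    ],
    capture_output=True,
    text=True,
)
# The upstream cuda-vectoradd sample prints a short result summary on success.
LOG.info(f"cuda-vectoradd pod logs:\n{proc.stdout}")
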
1 change: 1 addition & 0 deletions tests/integration/tests/test_util/tags.py
@@ -6,6 +6,7 @@
PULL_REQUEST = "pull_request"
NIGHTLY = "nightly"
WEEKLY = "weekly"
GPU = "gpu"

TEST_LEVELS = [PULL_REQUEST, NIGHTLY, WEEKLY]

73 changes: 73 additions & 0 deletions tests/integration/tests/test_util/util.py
Expand Up @@ -7,6 +7,7 @@
import re
import shlex
import subprocess
import time
import urllib.request
from datetime import datetime
from functools import partial
@@ -554,3 +555,75 @@ def check_file_paths_exist(
p: not (f"cannot access '{p}': No such file or directory" in process.stderr)
for p in paths
}


def get_os_version_id_for_instance(instance: harness.Instance) -> str:
    """Returns the version of the OS on the given harness Instance
    by reading the `VERSION_ID` from `/etc/os-release`.
    """
    proc = instance.exec(["cat", "/etc/os-release"], capture_output=True)

    release = None
    var = "VERSION_ID"
    for line in proc.stdout.split(b"\n"):
        line = line.decode()
        if line.startswith(var):
            # NOTE: split on the first '=' and strip any surrounding quotes,
            # since `/etc/os-release` quotes the value (e.g. VERSION_ID="22.04").
            release = line.split("=", 1)[1].strip().strip('"')
            break

    if release is None:
        raise ValueError(
            f"Failed to parse OS release var '{var}' from OS release "
            f"info: {proc.stdout}"
        )

    return release
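
A minimal usage sketch (assuming a harness `instance` fixture is in scope; on an Ubuntu 22.04 host this should return "22.04", which the gpu-operator test compares against its support matrix):

release = get_os_version_id_for_instance(instance)
LOG.info(f"Harness instance runs Ubuntu {release}")
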


def wait_for_daemonset(
    instance: harness.Instance,
    name: str,
    namespace: str = "default",
    retry_times: int = 5,
    retry_delay_s: int = 60,
    expected_pods_ready: int = 1,
):
    """Waits for the daemonset with the given name to have at least
    `expected_pods_ready` pods ready."""
    proc = None
    for i in range(retry_times):
        # NOTE: we can't reliably use `rollout status` on daemonsets unless
        # they have a `RollingUpdate` strategy, so we must go by the number
        # of pods which are Ready.
        proc = instance.exec(
            [
                "k8s",
                "kubectl",
                "-n",
                namespace,
                "get",
                "daemonset",
                name,
                "-o",
                "jsonpath={.status.numberReady}",
            ],
            check=True,
            capture_output=True,
        )
        # An empty `numberReady` means the status has not been populated yet.
        if int(proc.stdout.decode() or "0") >= expected_pods_ready:
            LOG.info(
                f"Successfully waited for daemonset '{name}' after "
                f"{(i+1)*retry_delay_s} seconds"
            )
            return

        LOG.info(
            f"Waiting {retry_delay_s} seconds for daemonset '{name}'.\n"
            f"code: {proc.returncode}\nstdout: {proc.stdout}\nstderr: {proc.stderr}"
        )
        time.sleep(retry_delay_s)

    raise AssertionError(
        f"Daemonset '{name}' failed to have at least {expected_pods_ready} "
        f"pod(s) ready after {retry_times} x {retry_delay_s} seconds."
    )
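
For reference, the gpu-operator test above drives this helper as follows, i.e. up to 15 x 60 seconds of waiting per daemonset:

# Wait up to 15 minutes for the driver daemonset to report at least one ready pod.
wait_for_daemonset(
    instance,
    "nvidia-driver-daemonset",
    namespace="gpu-operator",
    retry_times=15,
    retry_delay_s=60,
)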