diff --git a/tests/integration/templates/cuda-vectoradd-nvidia-gpu-test-pod.yaml b/tests/integration/templates/cuda-vectoradd-nvidia-gpu-test-pod.yaml
new file mode 100644
index 000000000..ee45d076e
--- /dev/null
+++ b/tests/integration/templates/cuda-vectoradd-nvidia-gpu-test-pod.yaml
@@ -0,0 +1,14 @@
+# Lifted 1:1 from:
+# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/getting-started.html#cuda-vectoradd
+apiVersion: v1
+kind: Pod
+metadata:
+  name: {}
+spec:
+  restartPolicy: OnFailure
+  containers:
+  - name: cuda-vectoradd
+    image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubuntu20.04"
+    resources:
+      limits:
+        nvidia.com/gpu: 1
diff --git a/tests/integration/tests/test_nvidia_gpu_operator.py b/tests/integration/tests/test_nvidia_gpu_operator.py
new file mode 100644
index 000000000..e380f0b07
--- /dev/null
+++ b/tests/integration/tests/test_nvidia_gpu_operator.py
@@ -0,0 +1,174 @@
+#
+# Copyright 2025 Canonical, Ltd.
+#
+
+import logging
+from typing import List, Mapping
+
+import pytest
+from test_util import config, harness, tags, util
+
+LOG = logging.getLogger(__name__)
+
+NVIDIA_GPU_OPERATOR_HELM_CHART_REPO = "https://helm.ngc.nvidia.com/nvidia"
+
+# Mapping between the versions of the Nvidia `gpu-operator` and
+# the host versions of Ubuntu they support.
+# Because the `nvidia-driver-daemonset` pod included in the `gpu-operator`
+# includes kernel drivers, its container image's release lifecycle is
+# strictly tied to the version of Ubuntu on the host.
+# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/platform-support.html
+NVIDIA_GPU_OPERATOR_SUPPORTED_UBUNTU_VERSIONS = {"v24.9.1": ["20.04", "22.04"]}
+
+NVIDIA_KERNEL_MODULE_NAMES = ["nvidia", "nvidia_uvm", "nvidia_modeset"]
+
+# Lifted 1:1 from:
+# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/getting-started.html#cuda-vectoradd
+NVIDIA_CUDA_VECTOR_ADDITION_TEST_POD_NAME = "cuda-vectoradd"
+
+
+def _check_nvidia_gpu_present(instance: harness.Instance) -> bool:
+    """Checks whether at least one Nvidia GPU is available
+    by exec-ing `lspci` on the target instance."""
+    proc = instance.exec(["lspci", "-k"], capture_output=True, text=True)
+
+    for line in proc.stdout.split("\n"):
+        if "NVIDIA Corporation" in line:
+            LOG.info(f"Found NVIDIA GPU in lspci output: {line}")
+            return True
+
+    LOG.info(f"Failed to find NVIDIA GPU in lspci output: {proc.stdout}")
+    return False
+
+
+def _check_nvidia_drivers_loaded(instance: harness.Instance) -> Mapping[str, bool]:
+    """Checks which Nvidia kernel modules are loaded on the given harness
+    instance, returning a mapping from module name to loaded status."""
+
+    proc = instance.exec(["lsmod"], capture_output=True, text=True)
+    modules_present = {m: False for m in NVIDIA_KERNEL_MODULE_NAMES}
+    for line in proc.stdout.split("\n"):
+        for mod in modules_present:
+            if line.startswith(mod):
+                modules_present[mod] = True
+
+    LOG.info(f"Located the following Nvidia kernel modules: {modules_present}")
+    return modules_present
+
+
+@pytest.mark.node_count(1)
+@pytest.mark.tags(tags.WEEKLY)
+@pytest.mark.tags(tags.GPU)
+@pytest.mark.parametrize(
+    "gpu_operator_version", NVIDIA_GPU_OPERATOR_SUPPORTED_UBUNTU_VERSIONS.keys()
+)
+def test_deploy_nvidia_gpu_operator(
+    instances: List[harness.Instance], gpu_operator_version: str
+):
+    """Tests that the Nvidia `gpu-operator` can be deployed successfully
+    using the upstream Helm chart, and that a sample application running a
+    small CUDA workload gets scheduled and executed to completion.
+    """
+    instance = instances[0]
+    test_namespace = "gpu-operator"
+
+    # Prechecks to ensure the test instance is valid.
+    if not _check_nvidia_gpu_present(instance):
+        msg = (
+            f"No Nvidia GPU present on harness instance '{instance.id}'. "
+            "Skipping GPU-operator test."
+        )
+        LOG.warning(msg)
+        pytest.skip(msg)
+
+    # NOTE(aznashwan): considering the Nvidia gpu-operator's main purpose
+    # is to set up the drivers on the nodes, and that running the `gpu-operator`
+    # with pre-installed drivers can lead to incompatibilities between the
+    # version of the drivers and the rest of the toolchain, we skip the test
+    # if any of the drivers happened to be pre-loaded on the harness instance:
+    modules_loaded = _check_nvidia_drivers_loaded(instance)
+    if any(modules_loaded.values()):
+        msg = (
+            f"Cannot have any pre-loaded Nvidia GPU drivers before running "
+            f"the Nvidia 'gpu-operator' test on instance {instance.id}. "
+            f"Current Nvidia driver statuses: {modules_loaded}"
+        )
+        LOG.warning(msg)
+        pytest.skip(msg)
+
+    instance_release = util.get_os_version_id_for_instance(instance)
+    if (
+        instance_release
+        not in NVIDIA_GPU_OPERATOR_SUPPORTED_UBUNTU_VERSIONS[gpu_operator_version]
+    ):
+        msg = (
+            f"Unsupported Ubuntu release '{instance_release}' for `gpu-operator` "
+            f"version '{gpu_operator_version}'. Skipping gpu-operator test."
+        )
+        LOG.warning(msg)
+        pytest.skip(msg)
+
+    # Add the upstream Nvidia GPU-operator Helm repo:
+    instance.exec(
+        ["k8s", "helm", "repo", "add", "nvidia", NVIDIA_GPU_OPERATOR_HELM_CHART_REPO]
+    )
+    instance.exec(["k8s", "helm", "repo", "update"])
+
+    # Install the `gpu-operator` chart:
+    instance.exec(
+        [
+            "k8s",
+            "helm",
+            "install",
+            "--generate-name",
+            "--wait",
+            "-n",
+            test_namespace,
+            "--create-namespace",
+            "nvidia/gpu-operator",
+            f"--version={gpu_operator_version}",
+        ]
+    )
+
+    # Wait for the core daemonsets of the gpu-operator to be ready:
+    daemonsets = [
+        "nvidia-driver-daemonset",
+        "nvidia-device-plugin-daemonset",
+        "nvidia-container-toolkit-daemonset",
+    ]
+    # NOTE(aznashwan): it takes on average a little under 10 minutes for all
+    # of the core daemonsets of the Nvidia GPU-operator to do their thing
+    # on an AWS `g4dn.xlarge` instance (4 vCPUs/16GiB RAM), so we offer a
+    # generous timeout of 15 minutes:
+    for daemonset in daemonsets:
+        util.wait_for_daemonset(
+            instance,
+            daemonset,
+            namespace=test_namespace,
+            retry_times=15,
+            retry_delay_s=60,
+        )
+
+    # Deploy a sample CUDA app and let it run to completion:
+    pod_spec_file = config.MANIFESTS_DIR / "cuda-vectoradd-nvidia-gpu-test-pod.yaml"
+    pod_spec = pod_spec_file.read_text().format(
+        NVIDIA_CUDA_VECTOR_ADDITION_TEST_POD_NAME
+    )
+    instance.exec(
+        ["k8s", "kubectl", "-n", test_namespace, "apply", "-f", "-"],
+        input=pod_spec.encode(),
+    )
+    util.stubbornly(retries=3, delay_s=1).on(instance).exec(
+        [
+            "k8s",
+            "kubectl",
+            "-n",
+            test_namespace,
+            "wait",
+            "--for=condition=ready",
+            "pod",
+            NVIDIA_CUDA_VECTOR_ADDITION_TEST_POD_NAME,
+            "--timeout",
+            "180s",
+        ]
+    )
diff --git a/tests/integration/tests/test_util/tags.py b/tests/integration/tests/test_util/tags.py
index 49f0204df..94cbfad5e 100644
--- a/tests/integration/tests/test_util/tags.py
+++ b/tests/integration/tests/test_util/tags.py
@@ -6,6 +6,7 @@
 PULL_REQUEST = "pull_request"
 NIGHTLY = "nightly"
 WEEKLY = "weekly"
+GPU = "gpu"
 
 TEST_LEVELS = [PULL_REQUEST, NIGHTLY, WEEKLY]
 
diff --git a/tests/integration/tests/test_util/util.py b/tests/integration/tests/test_util/util.py
index a403c41b5..87997e840 100644
--- a/tests/integration/tests/test_util/util.py
+++ b/tests/integration/tests/test_util/util.py
@@ -7,6 +7,7 @@
 import re
 import shlex
 import subprocess
+import time
 import urllib.request
 from datetime import datetime
 from functools import partial
@@ -554,3 +555,75 @@ def check_file_paths_exist(
         p: not (f"cannot access '{p}': No such file or directory" in process.stderr)
         for p in paths
     }
+
+
+def get_os_version_id_for_instance(instance: harness.Instance) -> str:
+    """Returns the version of the OS on the given harness Instance
+    by reading the `VERSION_ID` value from `/etc/os-release`.
+    """
+    proc = instance.exec(["cat", "/etc/os-release"], capture_output=True)
+
+    release = None
+    var = "VERSION_ID"
+    for line in proc.stdout.split(b"\n"):
+        line = line.decode()
+        if line.startswith(f"{var}="):
+            release = line.split("=", 1)[1].strip('"')
+            break
+
+    if release is None:
+        raise ValueError(
+            f"Failed to parse OS release var '{var}' from OS release "
+            f"info: {proc.stdout}"
+        )
+
+    return release
+
+
+def wait_for_daemonset(
+    instance: harness.Instance,
+    name: str,
+    namespace: str = "default",
+    retry_times: int = 5,
+    retry_delay_s: int = 60,
+    expected_pods_ready: int = 1,
+):
+    """Waits for the daemonset with the given name to have at least
+    `expected_pods_ready` pods ready."""
+    proc = None
+    for i in range(retry_times):
+        # NOTE: we can't reliably use `rollout status` on Daemonsets unless
+        # they have the `RollingUpdate` strategy, so we must go by the number
+        # of pods which are Ready.
+        proc = instance.exec(
+            [
+                "k8s",
+                "kubectl",
+                "-n",
+                namespace,
+                "get",
+                "daemonset",
+                name,
+                "-o",
+                "jsonpath={.status.numberReady}",
+            ],
+            check=True,
+            capture_output=True,
+        )
+        if int(proc.stdout.decode()) >= expected_pods_ready:
+            LOG.info(
+                f"Successfully waited for daemonset '{name}' after "
+                f"{(i+1)*retry_delay_s} seconds"
+            )
+            return
+
+        LOG.info(
+            f"Waiting {retry_delay_s} seconds for daemonset '{name}'.\n"
+            f"code: {proc.returncode}\nstdout: {proc.stdout}\nstderr: {proc.stderr}"
+        )
+        time.sleep(retry_delay_s)
+
+    raise AssertionError(
+        f"Daemonset '{name}' failed to have at least {expected_pods_ready} "
+        f"pod(s) ready after {retry_times} x {retry_delay_s} seconds."
+    )